Prediction of Wild Blueberry Yield | Kaggle Competition | Top 1.5% Leaderboard¶

Introduction¶

This data science project is a regression analysis that combines LightGBM (gradient boosting) models, trained both directly and through FLAML AutoML, and stacks their predictions with an LADRegression (least absolute deviation) blender. The project covers feature engineering, hyperparameter tuning, and model evaluation.

The project begins with code for data preprocessing, including handling missing values and modifying specific columns in the train and test datasets. Additionally, some data exploration and visualization are performed to gain insights into the data.

Next, there are code snippets related to feature engineering. The MyFeaturesEngineering class is defined as a custom transformer that applies Principal Component Analysis (PCA) and Partial Least Squares (PLS) regression to the specified features. The fit method of this class fits the PCA and PLS models to the data, while the transform method applies the transformations to the data.

The fit_lgbm function is used to train a LightGBM model. It takes various parameters such as the model's hyperparameters, the number of splits for cross-validation, and the features to use. Within the function, the data is split into training and validation sets using K-fold cross-validation. The MyFeaturesEngineering transformer is applied to the data, and the LightGBM model is trained and evaluated for each fold. The function returns the out-of-fold predictions, validation scores, and other metrics.

The code snippets involving the params dictionary define the hyperparameters for the LightGBM model. Different configurations are used in separate calls to the fit_lgbm function, where the model is trained with different sets of features and settings.

Finally, there is code related to hyperparameter tuning using grid search. An instance of the LADRegression model is created, and a grid search object is initialized with the hyperparameters to search. The grid search is fitted to the stacked out-of-fold predictions using cross-validation, the best hyperparameters and corresponding score are printed, and the tuned LAD blender produces the final submission.

Import required libraries¶

In [3]:
import os
import warnings
import subprocess
import importlib

def import_or_install(package):
    try:
        importlib.import_module(package)
        print(f'{package} is already installed')
    except ImportError:
        print(f'{package} is not installed, installing now...')
        subprocess.check_call(['pip', 'install', package])
        print(f'{package} has been installed')

warnings.filterwarnings('ignore')
import_or_install('flaml')
import_or_install('plotnine')
import_or_install('sklego')

import math
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 50)

from pathlib import Path
import random

from plotnine import * 

from sklearn.inspection import permutation_importance
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.metrics import mean_absolute_error

from flaml import AutoML
from flaml.automl.model import LGBMEstimator

from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression

import lightgbm as lgb

from sklego.linear_model import LADRegression
from sklearn.linear_model import Ridge

from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt

%matplotlib inline
%config InlineBackend.figure_format='retina'

INPUT_DIR = Path('/kaggle/input/playground-series-s3e14/')

Data Preprocessing¶

This code initializes various variables and sets up the necessary configurations for performing nested cross-validation with specific parameters and randomization settings.

In [4]:
TARGET_NAME = 'yield'

N_OUTER_FOLDS = 9
N_FOLDS = 10
N_REPEATS = 10


FIRST_TIME_BUDGET = 1 
MIN_TIME_BUDGET = 1 

INCLUDE_ORIGINAL = True

RANDOM_STATE = 42 

np.random.seed(RANDOM_STATE)

test_data = pd.read_csv(INPUT_DIR / 'test.csv'): This line reads the CSV file named 'test.csv' located in the directory specified by INPUT_DIR using the Pandas read_csv function. The resulting data is stored in the test_data variable.

test_data['data_type'] = 0: This line adds a new column named 'data_type' to the test_data DataFrame and sets all values in that column to 0.

train_data = pd.read_csv(INPUT_DIR / 'train.csv'): This line reads the CSV file named 'train.csv' from the specified directory and assigns the data to the train_data variable.

train_data['data_type'] = 0: This line adds a new column named 'data_type' to the train_data DataFrame and sets all values in that column to 0.

if INCLUDE_ORIGINAL: ...: This block checks if the variable INCLUDE_ORIGINAL is True. If so, it reads another CSV file named 'WildBlueberryPollinationSimulationData.csv' and assigns it to the original_data variable. It also adds a 'data_type' column with a value of 1 to the original_data DataFrame. Then, it concatenates the original_data DataFrame (excluding the 'Row#' column) with the train_data DataFrame using pd.concat, and assigns the result back to train_data.

features = ...: This line defines the list of feature column names: 'MinOfLowerTRange', 'RainingDays', 'fruitmass', 'fruitset', 'seeds', 'bumbles', and 'clonesize'. The 'data_type' column is appended to this list only when INCLUDE_ORIGINAL is False.

def fe(df): ...: This line defines a function named 'fe' that takes a DataFrame as input. Inside the function, a conditional assignment (via np.select) corrects a couple of odd values in the 'RainingDays' column, mapping 26 to 24 and 33 to 34.

fe(train_data): This line calls the 'fe' function on the train_data DataFrame, applying the modifications defined in the function to the 'RainingDays' column.

fe(test_data): This line calls the 'fe' function on the test_data DataFrame, applying the same modifications to the 'RainingDays' column.

In [5]:
test_data = pd.read_csv(INPUT_DIR / 'test.csv')
test_data['data_type'] = 0

train_data = pd.read_csv(INPUT_DIR / 'train.csv')
train_data['data_type'] = 0

if INCLUDE_ORIGINAL:
    original_data = pd.read_csv('/kaggle/input/wild-blueberry-yield-prediction-dataset/WildBlueberryPollinationSimulationData.csv')
    original_data['data_type'] = 1
    train_data = pd.concat([train_data, original_data.drop(columns=['Row#'])]).reset_index(drop=True)


features = (['MinOfLowerTRange', 'RainingDays', 'fruitmass', 'fruitset', 'seeds', 'bumbles', 'clonesize']
            +  ([] if INCLUDE_ORIGINAL else ['data_type']))

def fe(df):
    # a couple of rows seem odd 
    # probably not enough to make much of a difference
    df['RainingDays'] = np.select(
        condlist=[
            df['RainingDays'] == 26, 
            df['RainingDays'] == 33],
        choicelist= [24, 34],
        default=df['RainingDays'])
    
fe(train_data)
fe(test_data)

This code defines a post-processing function named mattop_post_process that takes a sequence of predictions as input. Inside the function, it modifies each prediction by finding the closest value from the unique target values obtained from the training data. This post-processing step can be useful for refining predictions to align them more closely with the available target values.

In [6]:
unique_targets = np.unique(train_data["yield"])

def mattop_post_process(preds):
    return np.array([min(unique_targets, key=lambda x: abs(x - pred)) for pred in preds])
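
Because unique_targets from np.unique is already sorted, the same nearest-value snapping can also be done in a vectorised way. The sketch below is not from the original notebook; mattop_post_process_fast is a hypothetical helper with equivalent behaviour that uses np.searchsorted and scales better for large prediction arrays.

def mattop_post_process_fast(preds):
    # locate each prediction's insertion point in the sorted unique targets
    idx = np.clip(np.searchsorted(unique_targets, preds), 1, len(unique_targets) - 1)
    left, right = unique_targets[idx - 1], unique_targets[idx]
    # snap to whichever neighbour is closer (ties go to the smaller value,
    # matching min(unique_targets, key=...) in the original function)
    return np.where(np.abs(preds - left) <= np.abs(preds - right), left, right)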

Exploratory Analysis¶

In [7]:
train_data.filter(features).describe().T
Out[7]:
count mean std min 25% 50% 75% max
MinOfLowerTRange 16066.0 28.661988 3.195960 24.300000 27.000000 30.000000 30.000000 33.000000
RainingDays 16066.0 18.643737 11.680365 1.000000 16.000000 16.000000 24.000000 34.000000
fruitmass 16066.0 0.446525 0.037200 0.311921 0.418867 0.446570 0.474777 0.535660
fruitset 16066.0 0.502711 0.074639 0.192732 0.457833 0.506686 0.560709 0.652144
seeds 16066.0 36.162894 4.048407 22.079199 33.232449 36.047770 39.158238 46.585105
bumbles 16066.0 0.286556 0.060249 0.000000 0.250000 0.250000 0.380000 0.585000
clonesize 16066.0 19.659374 6.618134 10.000000 12.500000 25.000000 25.000000 40.000000

Data Modeling¶

Flaml¶

This code includes several functions and a class related to the AutoML (Automated Machine Learning) process. Here's a breakdown of what each part does:

fold_mae(y, preds, data_type): This function calculates the mean absolute error (MAE) between the target values (y) and predictions (preds) for a specific data_type. It filters the data based on the data_type condition to evaluate the competition training data only.

get_fi(automl, estimator_name, X, y): This function extracts the permutation feature importance from a trained AutoML model (automl). It creates an empty DataFrame fi to store the feature importance results. It attempts to extract the importance using permutation_importance for two scenarios: if the AutoML model is an ensemble (specifically using LightGBM) or if it's a single estimator. If successful, it populates the fi DataFrame with the feature importances and assigns the estimator_name. Finally, it returns the fi DataFrame.

class AutoMLFitter: This class serves as a wrapper for the AutoML process. It allows decreasing the time budget for each call and keeps track of the best parameters found.

def __init__(self, name, time_budget, estimator_list=[], best_config=None, n_folds=10): This method initializes the AutoMLFitter class. It assigns values to the name, time_budget, estimator_list, best_config, and n_folds attributes.

def fit_automl(self, random_state, X, y): This method performs the AutoML fitting process. It sets up the automl_settings dictionary with various configuration options such as the time budget, evaluation metric, task type, ensemble usage, cross-validation settings, and more. The time budget is decreased for each fitting iteration, and if it falls below the minimum time budget (MIN_TIME_BUDGET), it is set to the minimum value. An AutoML instance is created, a custom LightGBM learner ('my_lgbm', implemented by the MyLGBM class) is registered with add_learner, and the fitting process is executed using the provided data (X, y) and the automl_settings. The best configuration per estimator is stored, and the trained AutoML object is returned.

In [9]:
def fold_mae(y, preds, data_type):
    # evaluate against competition training data only
    return mean_absolute_error(y[data_type==0], preds[data_type==0])

def get_fi(automl, estimator_name, X, y):
    # extract permutation feature importance
    # from the model.
    
    fi = pd.DataFrame({
        'estimator_name': [],
        'Feature': [],
        'Importance': []})
    
    try:
        # ensemble - use the lgbm member
        est = automl.model.named_estimators_['lgbm']
        imp = permutation_importance(est, X, y)
        fi = pd.DataFrame({
            'Importance': imp.importances_mean,
            'Feature': X.columns})
    except Exception:
        try:
            # single estimator
            est = automl.model
            imp = permutation_importance(est, X, y)
            fi = pd.DataFrame({
                'Importance': imp.importances_mean,
                'Feature': X.columns})
        except Exception:
            pass
    
    if len(fi) > 0:
        fi = fi.assign(estimator_name=estimator_name)
    
    return fi


class AutoMLFitter:
    # A wrap of the automl call to allow us to
    # decrease the time budget for each call
    # And keep track of the best parameters found
    
    def __init__(self, name, time_budget, estimator_list=[], best_config=None, n_folds=10):
        self.name = name
        self.time_budget = time_budget
        self.estimator_list = estimator_list
        self.best_config = best_config
        self.n_folds = n_folds

    def fit_automl(self, random_state, X, y):

        automl_settings = {
            "time_budget": self.time_budget,
            "metric": 'mae',
            "task": 'regression',
            "log_file_name": "",
            "ensemble": False,
            "eval_method": 'cv',
            "n_splits": self.n_folds,
            "seed": random_state,
            "verbose": 0,
            "estimator_list": self.estimator_list
        }

        # time budget is decreased on each fit.
        self.time_budget //= 1.5
        if self.time_budget < MIN_TIME_BUDGET:
            self.time_budget = MIN_TIME_BUDGET

        automl = AutoML()
        automl.add_learner(learner_name='my_lgbm', learner_class=MyLGBM)
        automl_settings["estimator_list"] = ['my_lgbm' ]  # change the estimator list
        
        automl.fit(X, y, starting_points=self.best_config, **automl_settings)
        self.best_config = automl.best_config_per_estimator

        return automl
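
Note that fit_automl registers a custom learner called MyLGBM, whose definition is not included in this excerpt; in the actual notebook it must be defined before AutoMLFitter.fit_automl is called. A minimal placeholder, assuming it simply wraps FLAML's built-in LGBMEstimator (the original presumably customises its search space or objective), would be:

from flaml.automl.model import LGBMEstimator

class MyLGBM(LGBMEstimator):
    # Placeholder definition: behaves exactly like FLAML's default LightGBM
    # learner. Replace with the notebook's actual customisation if available.
    pass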

This code includes two functions related to the fitting process of nested AutoML models. Here's a breakdown of what each part does:

def fit_nested_automl(model_fitters, random_state, train_data, test_data, features): This function performs the fitting of nested AutoML models. It takes a list of model_fitters (instances of the AutoMLFitter class), a random_state value, the train_data and test_data DataFrames, and a list of features as inputs.

It sets up a KFold object with N_OUTER_FOLDS splits for cross-validation and initializes lists and arrays for storing the fast feature importances, out-of-fold metrics, test predictions, and out-of-fold predictions. It then loops over the train/validation indices generated by the KFold object. Inside this loop, each model_fitter in the model_fitters list is fitted via the fit_automl method of the AutoMLFitter class, passing the random state, the filtered training data, and the target values; the out-of-fold and test predictions of each model are averaged into the fold's aggregate predictions, and get_fi is called to extract fast feature importances, which are appended to the fast_fis list. At the end of each fold, the fold-wise mean absolute error (oof_metric) is calculated with fold_mae and printed, and the overall out-of-fold predictions and metrics are updated. Finally, the function returns the out-of-fold predictions, test predictions, fast feature importances, and out-of-fold metrics.

def fit(model_fitters, train_data, test_data, features): This function performs the fitting process of nested AutoML models. It takes a list of model_fitters (instances of the AutoMLFitter class), train_data, test_data DataFrames, and a list of features as inputs.

It initializes arrays and lists for storing test predictions, out-of-fold predictions, out-of-fold metrics, out-of-fold fold-wise metrics, and fast feature importances. It then loops for N_REPEATS iterations, each with a different random_state value. Inside the loop, it calls fit_nested_automl on a reshuffled copy of the training data to perform the nested AutoML fitting, calculates the overall out-of-fold metric (oof_metric) and the metric after post-processing the predictions (oof_metric_pp) using fold_mae, updates the aggregate predictions and metrics, and prints oof_metric and oof_metric_pp for each repetition. Finally, it returns the out-of-fold predictions, test predictions, out-of-fold fold-wise metrics, and fast feature importances.

In [10]:
def fit_nested_automl(model_fitters, random_state, train_data, test_data, features):
                               
    k_fold = KFold(n_splits=N_OUTER_FOLDS, random_state=random_state, shuffle=True)
    
    fast_fis = []
    oof_metrics = []
    test_preds = np.zeros(len(test_data))
    oof_preds = np.zeros(len(train_data))
    
    print('OOF Metric: ', end='')
    for train_index, test_index in k_fold.split(train_data, train_data[TARGET_NAME]):
        tr, vl = train_data.loc[train_index], train_data.loc[test_index]
        
        # for each model:
        oof_pred = np.zeros(len(vl))
        test_pred = np.zeros(len(test_data))
        best_loss = 0
        
        for model_fitter in model_fitters:
            
            automl = model_fitter.fit_automl(
                random_state, 
                tr.filter(features), 
                tr[TARGET_NAME].values)
        
            oof_pred  += automl.predict(vl.filter(features)) / len(model_fitters)
            test_pred += automl.predict(test_data.filter(features)) / len(model_fitters)
            best_loss += automl.best_loss / (len(model_fitters))
        
            fast_fi =  get_fi(automl, model_fitter.name, vl.filter(features), vl[TARGET_NAME])
            fast_fis.append(fast_fi)
        
        oof_metric = fold_mae(
            vl[TARGET_NAME].values, 
            oof_pred, 
            vl['data_type'].values)
        
        print(f'{oof_metric:4.1f}', end=' ')
        
        oof_preds[test_index] += oof_pred
        test_preds += test_pred / N_OUTER_FOLDS
        oof_metrics.append(oof_metric)

    return oof_preds, test_preds, fast_fis, oof_metrics


def fit(model_fitters, train_data, test_data, features):
    test_preds = np.zeros(len(test_data))
    oof_preds = np.zeros(len(train_data))
    oof_metrics = []
    oof_fold_metrics = []
    fast_fis = []
    for i in range(N_REPEATS): 
        # fit
        oof_pred, test_pred, fast_fi, oof_fold_metric = fit_nested_automl(
            model_fitters,
            RANDOM_STATE + i,
            train_data.sample(frac=1),
            test_data,
            features)

        oof_metric = fold_mae(
            train_data[TARGET_NAME].values, 
            oof_pred, 
            train_data['data_type'].values)
        
        oof_metric_pp = fold_mae(
            train_data[TARGET_NAME].values, 
            mattop_post_process(oof_pred), 
            train_data['data_type'].values)

        test_preds += (test_pred) / N_REPEATS
        oof_preds += (oof_pred) / N_REPEATS

        oof_fold_metrics.extend(oof_fold_metric)
        fast_fis.extend(fast_fi)

        print(f'| Repeat {i}: {oof_metric:4.1f} mattop postprocessing {oof_metric_pp:4.1f}')
        
    return oof_preds, test_preds, oof_fold_metrics, fast_fis

Train AutoML Model¶

This code prepares a single AutoML fitter with a specific time budget, a previously discovered hyperparameter configuration, and the number of folds for nested cross-validation. It first fits the AutoML model once on the full (shuffled) training data, prints the best configuration, and then runs the repeated nested cross-validation fit, generating out-of-fold predictions, test predictions, and metrics.

In [11]:
tr = train_data.sample(frac=1, random_state=RANDOM_STATE)

model_fitter = AutoMLFitter(
    name='model',
    time_budget= FIRST_TIME_BUDGET, 
    n_folds=5) # nested CV: 8/9 * 9/10 = 80%, equivalent to 5 folds

# here's one I made earlier:
# running the fitter with a longer budget will discover these 
# hyperparameters:
model_fitter.best_config = {
    'my_lgbm': {'n_estimators': 265, 'num_leaves': 93, 'min_child_samples': 20, 
                'learning_rate': 0.05533790147941807, 'log_max_bin': 10, 
                'colsample_bytree': 0.8809128870084636, 'reg_alpha': 0.0009765625, 
                'reg_lambda': 0.015589408048174165}}

model_fitter.fit_automl(
    RANDOM_STATE, 
    tr.filter(features), 
    tr[TARGET_NAME].values)

model_fitter.time_budget = MIN_TIME_BUDGET
model_fitter.n_folds = N_FOLDS

print(model_fitter.best_config)

models = [model_fitter]
# model fit & output
oof_preds, test_preds, oof_fold_metrics, fast_fis = fit(models, train_data, test_data, features)
{'my_lgbm': {'n_estimators': 265, 'num_leaves': 93, 'min_child_samples': 20, 'learning_rate': 0.05533790147941807, 'log_max_bin': 10, 'colsample_bytree': 0.8809128870084636, 'reg_alpha': 0.0009765625, 'reg_lambda': 0.01558940804817417}}
OOF Metric: 341.1 343.8 322.7 337.7 333.3 352.6 333.8 344.6 345.7 | Repeat 0: 339.5 mattop postprocessing 339.3
OOF Metric: 334.4 353.4 354.7 334.2 353.5 339.8 322.6 326.4 338.1 | Repeat 1: 339.7 mattop postprocessing 339.5
OOF Metric: 330.6 338.3 345.8 380.0 324.9 342.6 328.5 332.4 338.2 | Repeat 2: 340.1 mattop postprocessing 339.9
OOF Metric: 333.2 343.6 366.3 324.3 339.5 327.9 344.8 345.5 330.7 | Repeat 3: 339.5 mattop postprocessing 339.4
OOF Metric: 349.1 346.1 344.8 330.2 330.1 333.9 333.5 341.3 349.3 | Repeat 4: 339.8 mattop postprocessing 339.7
OOF Metric: 321.8 356.2 333.0 358.9 330.7 336.8 334.3 350.8 341.7 | Repeat 5: 340.5 mattop postprocessing 340.3
OOF Metric: 316.1 330.8 350.8 330.9 356.9 324.3 326.3 366.1 349.2 | Repeat 6: 339.1 mattop postprocessing 338.9
OOF Metric: 345.2 350.7 340.5 325.1 346.3 327.2 335.7 334.3 349.6 | Repeat 7: 339.4 mattop postprocessing 339.3
OOF Metric: 347.9 339.9 336.7 338.4 358.5 341.9 338.5 328.2 323.6 | Repeat 8: 339.3 mattop postprocessing 339.2
OOF Metric: 339.4 335.3 341.1 325.8 338.7 355.6 336.8 344.7 340.0 | Repeat 9: 339.7 mattop postprocessing 339.5

Model Performance¶

df_metrics = pd.DataFrame({'oof': oof_fold_metrics}): This line creates a DataFrame df_metrics using the list of out-of-fold fold-wise metrics oof_fold_metrics.

print(ggplot(df_metrics, aes(x='oof')) + ...): This block creates a plot with plotnine's ggplot. It sets the aesthetics, theme, and size of the plot, adds a dot plot of the metric distribution, includes a dashed vertical line at the mean of the metrics, and adds appropriate labels and a title.

df_importance = pd.concat(fast_fis).sort_values(['Importance']): This line concatenates the list of fast feature importance DataFrames fast_fis into a single DataFrame df_importance and sorts it based on the 'Importance' column.

if len(df_importance) > 0: ...: This block checks if df_importance is not empty and proceeds to generate a feature importance plot.

df_importance_mean = ...: This block calculates the mean and standard deviation of feature importances across different models. It sorts the DataFrame by the mean importance and assigns category labels to the features. It also calculates the minimum and maximum bounds for the mean importance.

df_importance = df_importance.merge(df_importance_mean.filter(['Feature', 'feature_cat'])): This line merges df_importance with the 'Feature' and 'feature_cat' columns of df_importance_mean, so that each importance row carries its feature's ordered category label.

print(ggplot(df_importance, aes(y='feature_cat', x='Importance')) + ...): This block creates a plot showing the permutation feature importance. It sets the aesthetics, theme, and size of the plot, adds points representing the mean importance for each feature, and includes appropriate labels and title.

if len(df_importance) > 0: ...: This block checks if df_importance is not empty and proceeds to print the top 20 features along with their mean importance and standard deviation.

In [12]:
df_metrics = pd.DataFrame({'oof': oof_fold_metrics})
print(ggplot(df_metrics, aes(x='oof'))
    + theme_light()
    + theme(figure_size=(6, 3))
    + geom_dotplot(alpha=0.3, binwidth = 1, fill='SteelBlue')
    + geom_vline(xintercept=np.mean(oof_fold_metrics), linetype='dashed')
    + labs(
        y = '', x='', 
        title = f'OOF Metric Distribution\nMean MAE={np.mean(oof_fold_metrics):4.1f}'))

df_importance = pd.concat(fast_fis).sort_values(['Importance'])
if len(df_importance) > 0:
    df_importance_mean = (
        df_importance
        .groupby(['Feature'], as_index=False)
        .agg(mean_imp=('Importance', 'mean'), std_imp=('Importance', 'std'))
        .sort_values(['mean_imp']))

    feature_list = df_importance_mean['Feature'].tolist()
    feature_cat  = pd.Categorical(df_importance_mean['Feature'], categories=feature_list) #[::-1])
    
    df_importance_mean = (df_importance_mean
        .assign(
            feature_cat = feature_cat,
            mean_imp_min = lambda x: x['mean_imp'] - 2 * x['std_imp'],
            mean_imp_max = lambda x: x['mean_imp'] + 2 * x['std_imp']))
    
    df_importance = df_importance.merge(df_importance_mean.filter(['Feature', 'feature_cat']))

    print(ggplot(df_importance, aes(y='feature_cat', x='Importance')) 
        + theme_light()
        + theme(figure_size=(6, 4))
        #+ geom_jitter(alpha=0.2, size=0.5, height=0.1)
        + geom_point(mapping=aes(x='mean_imp'), colour='SteelBlue', data=df_importance_mean) 
        #+ geom_errorbarh(
        #    mapping=aes(x='mean_imp', xmin='mean_imp_min', xmax='mean_imp_max'), 
        #    alpha=0.5,  data=df_importance_mean)  
        + labs(
            y = '', x='', 
            title = (f'Permutation Feature Importance\n'
                f'oof metric={np.mean(oof_fold_metrics):4.1f}'))
        #+ facet_wrap('estimator_name', ncol=2)
         )
    
if len(df_importance) > 0:
    print(df_importance_mean.filter(['Feature', 'mean_imp', 'std_imp']).head(20))

            Feature  mean_imp   std_imp
2           bumbles  0.000422  0.000389
3         clonesize  0.000948  0.000684
1       RainingDays  0.003067  0.000931
0  MinOfLowerTRange  0.006432  0.001249
4         fruitmass  0.012293  0.002582
6             seeds  0.528243  0.030693
5          fruitset  0.773959  0.051585

LGBM¶

This code defines a wrapper class called PLSRegressionWrapper that extends the functionality of PLSRegression from scikit-learn. The wrapper class provides two additional methods: transform and fit_transform.

transform(self, X): This method overrides the transform method of the parent class. It takes an input matrix X and calls the transform method of the parent class (super().transform(X)) to perform the transformation using Partial Least Squares Regression.

fit_transform(self, X, Y): This method overrides the fit_transform method of the parent class. It takes input matrices X and Y and performs both fitting and transformation. It calls the fit method of the parent class (self.fit(X, Y)) to fit the model to the data, and then calls the transform method of the parent class (self.transform(X)) to apply the learned transformation to X.

In [15]:
SEED = 1984
target = "yield"

class PLSRegressionWrapper(PLSRegression):
    def transform(self, X):
        return super().transform(X)
    def fit_transform(self, X, Y):
        return self.fit(X,Y).transform(X)
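
The wrapper is likely needed for pipeline compatibility: in scikit-learn, PLSRegression.fit_transform(X, Y) returns both the X and Y scores when a target is supplied, whereas a pipeline step is expected to return a single array; the wrapper's fit_transform returns only the X scores. The snippet below is an illustrative usage sketch (not part of the original notebook); X_demo and y_demo are made-up names.

import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
X_demo = rng.normal(size=(100, 3))
y_demo = rng.normal(size=100)

# fit_transform returns a single array of X scores, so the wrapper works as a pipeline step
pls_pipe = make_pipeline(StandardScaler(), PLSRegressionWrapper(n_components=2))
scores = pls_pipe.fit_transform(X_demo, y_demo)
print(scores.shape)  # (100, 2)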

Preprocessing for LGBM¶

This code performs some data preprocessing steps on the train, test, and origin dataframes.

The first block of code modifies the "RainingDays" column in the train and test dataframes. It replaces the values 26 with 24 in the "RainingDays" column of the train dataframe and replaces the values 33 with 34 in the "RainingDays" column of the test dataframe. This code is used to handle specific values in the "RainingDays" column that need to be adjusted.

The second block of code modifies the "MaxOfUpperTRange" column in the train, test, and origin dataframes. It replaces certain values in the "MaxOfUpperTRange" column with the value 86. Specifically, it replaces the values 71.9, 79, and 89 with 86 in all three dataframes.

The third block of code displays the value counts of the "RainingDays" and "MaxOfUpperTRange" columns for each of the dataframes (train, test, and origin). It shows the frequency distribution of the unique values in these columns to provide an overview of the data after the modifications have been applied. The display function is used to show the value counts in a formatted manner.

In [16]:
train =  pd.read_csv('/kaggle/input/playground-series-s3e14/train.csv', index_col = "id")
test =  pd.read_csv('/kaggle/input/playground-series-s3e14/test.csv', index_col = "id")
origin = pd.read_csv('../input/wild-blueberry-yield-prediction-dataset/WildBlueberryPollinationSimulationData.csv', index_col = "Row#")
origin.index.name = "id"

train.loc[train["RainingDays"]==26, "RainingDays"] = 24
test.loc[test["RainingDays"]==33, "RainingDays"] = 34

for df in [train, test, origin]:
    df.loc[df["MaxOfUpperTRange"].isin([71.9, 79, 89]), "MaxOfUpperTRange"] = 86

for df in [train, test, origin]:
    display(df["RainingDays"].value_counts())
    display(df["MaxOfUpperTRange"].value_counts())
16.00    4361
24.00    3838
34.00    3521
1.00     3521
3.77       48
Name: RainingDays, dtype: int64
86.0    4203
77.4    3788
94.6    3734
69.7    3564
Name: MaxOfUpperTRange, dtype: int64
16.00    2831
24.00    2534
1.00     2438
34.00    2352
3.77       39
Name: RainingDays, dtype: int64
86.0    2730
77.4    2594
94.6    2532
69.7    2338
Name: MaxOfUpperTRange, dtype: int64
16.00    194
1.00     192
24.00    188
34.00    187
3.77      16
Name: RainingDays, dtype: int64
86.0    214
94.6    194
77.4    188
69.7    181
Name: MaxOfUpperTRange, dtype: int64

Feature Engineering for LGBM¶

This code defines a custom transformer class named MyFeaturesEngineering that performs feature engineering on a dataset using PCA (Principal Component Analysis) and PLS (Partial Least Squares) regression techniques.

The MyFeaturesEngineering class inherits from the BaseEstimator and TransformerMixin classes, which are part of the scikit-learn library and provide the necessary functionality for creating custom transformers.

The __init__ method initializes the transformer and defines its configurable parameters. The verbose parameter is a boolean flag indicating whether to print verbose output. The feats_for_pca and feats_for_pls parameters are lists of feature names that will be used for PCA and PLS, respectively.

The fit method fits the transformer to the data. It creates two pipelines: pipe_pca for PCA and pipe_pls for PLS. Each pipeline consists of a StandardScaler() transformer to standardize the features and either PCA or PLSRegressionWrapper as the main transformer. The number of components for PCA and PLS is determined by the length of the corresponding feature lists. The pipelines are fitted to the specified features and target variable.

The transform method applies the fitted transformer to the data. It creates a copy of the input data and then applies the transform method of both pipelines to generate new transformed features. The transformed features are added to the copied dataframe using column names based on the component number, such as "pca_0", "pca_1", and "pls_0", "pls_1", and so on.

The transformed dataframe is returned as the output of the transform method.

In [17]:
class MyFeaturesEngineering(BaseEstimator, TransformerMixin):
    
    def __init__(self, verbose = False, 
                 feats_for_pca = ["seeds", "fruitmass", "fruitset"], 
                 feats_for_pls = ["seeds", "fruitmass", "fruitset"]):
        
        self.verbose = verbose
        
        self.pca_components = len(feats_for_pca)
        self.feats_for_pca = feats_for_pca
        
        self.pls_components = len(feats_for_pls)
        self.feats_for_pls = feats_for_pls
        

    def fit(self, x, y=None):
        
        self.pipe_pca = make_pipeline(StandardScaler(), PCA(n_components = self.pca_components))
        self.pipe_pca.fit(x[self.feats_for_pca])
        
        self.pipe_pls = make_pipeline(StandardScaler(), PLSRegressionWrapper(n_components = self.pls_components))
        self.pipe_pls.fit(x[self.feats_for_pls], x[target])
        
        return self
    
    def transform(self, x, y=None):
        
        df = x.copy()
        
        pca_cols = [f"pca_{i}" for i in range(self.pca_components)]
        df[pca_cols] = self.pipe_pca.transform(df[self.feats_for_pca])
        
        pls_cols = [f"pls_{i}" for i in range(self.pls_components)]
        df[pls_cols] = self.pipe_pls.transform(df[self.feats_for_pls])
        
        return df
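
A quick sanity check of the transformer (illustrative only, not part of the original notebook): with the default feature lists, fit_transform should append three pca_* and three pls_* columns to the frame. It relies on the train dataframe and the global target defined earlier.

fe_check = MyFeaturesEngineering()
train_fe = fe_check.fit_transform(train)

new_cols = [c for c in train_fe.columns if c.startswith(("pca_", "pls_"))]
print(new_cols)                              # expected: pca_0..pca_2 and pls_0..pls_2
print(train_fe.shape[0] == train.shape[0])   # row count is unchanged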

Train LGBM¶

This code defines a function named fit_lgbm that performs LightGBM model training using cross-validation. It takes several parameters including model hyperparameters, features to use, and additional options for feature engineering.

The function starts by initializing empty lists to store the best iteration, validation scores, and training scores for each fold. It also initializes empty pandas Series objects to store the out-of-fold predictions (oof) and test predictions (y_pred).

The function then sets up a cross-validation loop using KFold with the specified number of splits and random seed. Within each fold, it performs the following steps:

- Initializes an instance of the MyFeaturesEngineering class, which performs feature engineering using PCA and PLS on the training data. If use_original is True, it concatenates the training data with the original dataset (origin) and applies feature engineering; otherwise, it applies feature engineering only on the training data.
- Splits the feature-engineered training data (X_trn) and corresponding target (y_trn) into training and validation sets based on the fold indices.
- Transforms the validation data (X_val) using the fitted feature engineering.
- Transforms the test data (X_pred) using the fitted feature engineering.
- Creates LightGBM Dataset objects (dtrn and dval) from the training and validation data.
- Trains the LightGBM model using lgb.train, specifying the parameters, number of boosting rounds, validation set, and optional callbacks for early stopping, evaluation logging, and recording of evaluation results.
- Stores the best iteration found during training.
- Predicts on the validation set and adds the predictions to the oof series.
- Predicts on the test set and adds the predictions to the y_pred series, divided by the number of splits for averaging.
- Computes and stores the validation and training scores for the fold.
- Prints the fold-wise scores if verbose is True.

After the cross-validation loop, the function calculates and prints the overall out-of-fold mean absolute error (MAE), the mean validation MAE, the difference between mean training and validation MAE (overfit), and the standard deviation of the validation MAE.

Finally, the function returns a dictionary containing the oof predictions, oof score, overfit value, best iterations, and y_pred predictions.

In [18]:
def fit_lgbm(params={}, features = None, use_original = False,
            n_splits = 5, seed = SEED, verbose = False,
             feats_for_pca = ["seeds", "fruitmass", "fruitset"], 
                 feats_for_pls = ["seeds", "fruitmass", "fruitset"]
            ):
    
    if features is None:
        features = test.columns
    best_iteration, val_score, trn_score = [], [], []
    eval_result={}
    callbacks = [lgb.early_stopping(100), lgb.record_evaluation(eval_result)]
    if verbose:
        callbacks.append(lgb.log_evaluation(200))
        
    oof = pd.Series(0.0, index=train.index)
    y_pred = pd.Series(0.0, index=test.index)
   
    # CV loop...
    folds = KFold(n_splits = n_splits, shuffle = True, random_state = seed)
    for fold, (trn_idx, val_idx) in enumerate(folds.split(train, train[target])):
        
        fe = MyFeaturesEngineering(feats_for_pca = feats_for_pca, feats_for_pls = feats_for_pls)
        if use_original: # Concat train and origin datasets for fitting
            X_trn = fe.fit_transform(pd.concat([train.loc[trn_idx], origin], axis=0))[features]
            y_trn = pd.concat([train.loc[trn_idx], origin], axis=0)[target]
        else:            # and don't
            X_trn = fe.fit_transform(train.loc[trn_idx])[features]
            y_trn = train.loc[trn_idx, target]

        # Don't use original dataset to fit
        X_val, y_val = fe.transform(train.loc[val_idx])[features], train.loc[val_idx, target]
        
        X_pred = fe.transform(test)[features]
        
        dtrn = lgb.Dataset(X_trn, label = y_trn) 
        dval = lgb.Dataset(X_val, label = y_val)

        # TRAIN MODEL        
        model = lgb.train(params, dtrn, num_boost_round = 2000, valid_sets = dval, callbacks = callbacks)#, categorical_feature = "auto")
        best_iteration.append(model.best_iteration)
        
        # Results, score
        oof.loc[val_idx] = model.predict(X_val, num_iteration = best_iteration[fold])
        y_pred += model.predict(X_pred, num_iteration = best_iteration[fold]) / n_splits
        val_score.append(mean_absolute_error(y_val, oof.loc[val_idx]))
        trn_score.append(mean_absolute_error(y_trn, model.predict(X_trn)))
        if verbose:
            print(f'Fold {fold + 1} - Valid : {val_score[fold]:.5f} - Train : {trn_score[fold]:.5f} - Best it. : {best_iteration[fold]:4}')
        
    # End CV Loop
    print(f"OOF MAE : {mean_absolute_error(train[target], oof):.5f} - Mean MAE {np.mean(val_score):.5f} - Overfeat {np.mean(trn_score) - np.mean(val_score):.5f} - Std {np.std(val_score):.5f}")
    
    return {"oof":oof, "oof_score":mean_absolute_error(train[target], oof), "overfeat":np.mean(trn_score) - np.mean(val_score),
           "best_iteration":best_iteration, "y_pred":y_pred}

This code defines a dictionary named params that contains various hyperparameters for the LightGBM model. The hyperparameters include the objective function ('regression_l1'), learning rate (0.04), maximum number of bins (1000), column subsampling ratio (0.8), row subsampling ratio (0.7), bagging frequency (1), random state (set to a predefined SEED value), and verbose mode (-1 for no output during training).

The code then calls the fit_lgbm function with the specified parameters. It passes the params dictionary as the hyperparameters for the LightGBM model, sets the number of splits to 10, the random seed to the predefined SEED value, verbose mode to True, and the use_original flag to True. It also specifies the features to use: the first two PCA components ("pca_0", "pca_1") together with several original features ('RainingDays', 'fruitmass', 'MaxOfUpperTRange', 'fruitset', 'seeds').

The function fit_lgbm is expected to train the LightGBM model using cross-validation and perform feature engineering with PCA and PLS. It will print fold-wise scores during training and return a dictionary with various evaluation metrics and predictions.

In [18]:
params = {
    'objective':'regression_l1',
#    'num_leaves': 31, # Default : 31
#    'min_child_samples': 20, # Default : 20
    'learning_rate': 0.04, 
    'max_bin': 1000, # Default : 255
    'colsample_bytree': .8, # Default : 1
    'subsample': .7, # Default : 1
    'bagging_freq': 1,
    'random_state': SEED,
    'verbose':-1,
}

res1 = fit_lgbm(params, n_splits = 10, seed = SEED, verbose = True, use_original = True, 
               feats_for_pca = ['fruitset', 'seeds', 'fruitmass'], 
              features=["pca_0", "pca_1", 'RainingDays', 'fruitmass', 'MaxOfUpperTRange', 'fruitset', 'seeds'])


print(f"OOF MAE : {res1['oof_score']:.5f}")
Training until validation scores don't improve for 100 rounds
[200]	valid_0's l1: 334.963
[400]	valid_0's l1: 332.873
[600]	valid_0's l1: 332.636
Early stopping, best iteration is:
[558]	valid_0's l1: 332.454
Fold 1 - Valid : 332.45375 - Train : 291.47038 - Best it. :  558
[200]	valid_0's l1: 341.906
[400]	valid_0's l1: 339.57
[600]	valid_0's l1: 339.294
Early stopping, best iteration is:
[558]	valid_0's l1: 332.454
Fold 2 - Valid : 339.11370 - Train : 290.55379 - Best it. :  558
[200]	valid_0's l1: 341.371
[400]	valid_0's l1: 340.109
[600]	valid_0's l1: 339.544
Early stopping, best iteration is:
[558]	valid_0's l1: 332.454
Fold 3 - Valid : 339.51066 - Train : 291.34753 - Best it. :  558
[200]	valid_0's l1: 325.544
[400]	valid_0's l1: 324.174
Early stopping, best iteration is:
[426]	valid_0's l1: 323.828
Fold 4 - Valid : 323.82754 - Train : 298.33966 - Best it. :  426
[200]	valid_0's l1: 341.359
[400]	valid_0's l1: 339.708
Early stopping, best iteration is:
[426]	valid_0's l1: 323.828
Fold 5 - Valid : 339.59345 - Train : 296.70489 - Best it. :  426
[200]	valid_0's l1: 328.306
[400]	valid_0's l1: 326.873
Early stopping, best iteration is:
[426]	valid_0's l1: 323.828
Fold 6 - Valid : 326.73065 - Train : 297.63902 - Best it. :  426
[200]	valid_0's l1: 353.306
[400]	valid_0's l1: 351.549
Early stopping, best iteration is:
[426]	valid_0's l1: 323.828
Fold 7 - Valid : 351.39655 - Train : 295.60687 - Best it. :  426
[200]	valid_0's l1: 352.305
[400]	valid_0's l1: 350.92
Early stopping, best iteration is:
[426]	valid_0's l1: 323.828
Fold 8 - Valid : 350.75259 - Train : 295.17723 - Best it. :  426
[200]	valid_0's l1: 337.706
[400]	valid_0's l1: 337.02
Early stopping, best iteration is:
[426]	valid_0's l1: 323.828
Fold 9 - Valid : 336.94442 - Train : 296.70300 - Best it. :  426
[200]	valid_0's l1: 352.177
[400]	valid_0's l1: 351.033
Early stopping, best iteration is:
[426]	valid_0's l1: 323.828
Fold 10 - Valid : 350.87840 - Train : 295.25871 - Best it. :  426
OOF MAE : 339.11940 - Mean MAE 339.12017 - Overfit -44.24006 - Std 9.28006
OOF MAE : 339.11940

This code is similar to the previous code snippet, but it includes an additional parameter for feature selection using Partial Least Squares (PLS). The function call to fit_lgbm is almost identical, except for the inclusion of the feats_for_pls parameter.

The feats_for_pls parameter is set to ['fruitset', 'seeds'], which specifies the features used to fit the PLS component; the resulting "pls_0" score is added to the model's feature list alongside the PCA components derived from feats_for_pca. This means the function performs feature engineering using both PCA and PLS techniques.

The rest of the code is the same as before, with the LightGBM model being trained using cross-validation, feature engineering being applied, and fold-wise scores being printed during training.

In [19]:
res2 = fit_lgbm(params, n_splits = 10, seed = SEED, verbose = True, use_original = True, 
               feats_for_pca = ['fruitset', 'seeds', 'fruitmass'], feats_for_pls = ['fruitset', 'seeds'], 
              features=["pls_0", "pca_0", "pca_1", 'RainingDays', 'fruitmass', 'MaxOfUpperTRange', 'fruitset', 'seeds'])
Training until validation scores don't improve for 100 rounds
[200]	valid_0's l1: 336.26
[400]	valid_0's l1: 333.867
[600]	valid_0's l1: 333.336
Early stopping, best iteration is:
[625]	valid_0's l1: 333.145
Fold 1 - Valid : 333.14507 - Train : 288.53947 - Best it. :  625
[200]	valid_0's l1: 342.446
[400]	valid_0's l1: 340.551
[600]	valid_0's l1: 339.994
Early stopping, best iteration is:
[625]	valid_0's l1: 333.145
Fold 2 - Valid : 340.11188 - Train : 287.22511 - Best it. :  625
[200]	valid_0's l1: 340.42
[400]	valid_0's l1: 338.614
[600]	valid_0's l1: 338.337
Early stopping, best iteration is:
[625]	valid_0's l1: 333.145
Fold 3 - Valid : 338.38831 - Train : 288.82153 - Best it. :  625
[200]	valid_0's l1: 324.87
[400]	valid_0's l1: 323.13
[600]	valid_0's l1: 322.66
Early stopping, best iteration is:
[670]	valid_0's l1: 322.221
Fold 4 - Valid : 322.22086 - Train : 288.62732 - Best it. :  670
[200]	valid_0's l1: 341.975
[400]	valid_0's l1: 340.977
[600]	valid_0's l1: 341.027
Early stopping, best iteration is:
[670]	valid_0's l1: 322.221
Fold 5 - Valid : 341.14643 - Train : 285.99289 - Best it. :  670
[200]	valid_0's l1: 328.21
[400]	valid_0's l1: 326.87
[600]	valid_0's l1: 326.8
Early stopping, best iteration is:
[670]	valid_0's l1: 322.221
Fold 6 - Valid : 326.84320 - Train : 286.93019 - Best it. :  670
[200]	valid_0's l1: 355.352
[400]	valid_0's l1: 353.402
[600]	valid_0's l1: 352.504
Early stopping, best iteration is:
[670]	valid_0's l1: 322.221
Fold 7 - Valid : 352.27218 - Train : 286.13865 - Best it. :  670
[200]	valid_0's l1: 353.562
[400]	valid_0's l1: 351.897
[600]	valid_0's l1: 351.781
Early stopping, best iteration is:
[670]	valid_0's l1: 322.221
Fold 8 - Valid : 351.17550 - Train : 285.47058 - Best it. :  670
[200]	valid_0's l1: 337.347
[400]	valid_0's l1: 336.056
[600]	valid_0's l1: 335.165
Early stopping, best iteration is:
[670]	valid_0's l1: 322.221
Fold 9 - Valid : 335.34996 - Train : 286.05123 - Best it. :  670
[200]	valid_0's l1: 353.14
[400]	valid_0's l1: 353.057
[600]	valid_0's l1: 352.275
Early stopping, best iteration is:
[670]	valid_0's l1: 322.221
Fold 10 - Valid : 352.09652 - Train : 284.44975 - Best it. :  670
OOF MAE : 339.27415 - Mean MAE 339.27499 - Overfit -52.45032 - Std 9.89093

Stacking with Least Absolute Deviation (LAD) Regression¶

Concatenate predictions into a single dataset¶

In [22]:
oofs = pd.DataFrame(index=train.index)
oofs = pd.concat([oofs, 
                  pd.Series(oof_preds[:train.shape[0]], index=train.index), 
                  res1["oof"], 
                  res2["oof"]], axis=1)
oofs.columns=["Patrick", "Alex1", "Alex2"]

preds = pd.DataFrame(index=test.index)
preds = pd.concat([preds, 
                  pd.Series(test_preds, index=test.index), 
                  res1["y_pred"], 
                  res2["y_pred"]], axis=1)
preds.columns=["Patrick", "Alex1", "Alex2"]
display(oofs)
display(preds)
Patrick Alex1 Alex2
id
0 4524.715276 4536.927508 4559.274255
1 4992.224168 4924.787941 4929.542671
2 6627.552171 6667.346578 6700.989518
3 6897.276842 6864.216806 6870.509389
4 7088.545690 7225.271564 7243.258630
... ... ... ...
15284 7205.011832 7287.927125 7299.634547
15285 3477.897034 3604.651889 3612.157124
15286 4528.379278 4540.974485 4581.256973
15287 6535.315979 6708.789779 6694.445535
15288 5855.005176 5780.637467 5794.202996

15289 rows × 3 columns

Patrick Alex1 Alex2
id
15289 4314.075337 4315.208765 4318.745770
15290 5871.467491 5883.606886 5915.360576
15291 7278.301828 7228.281588 7237.722345
15292 4494.747084 4615.898335 4586.408810
15293 3842.376552 3835.663093 3833.802631
... ... ... ...
25478 5404.957437 5437.875677 5411.124691
25479 5608.263165 5615.289054 5616.039729
25480 6491.763603 6479.994065 6477.833953
25481 4422.558822 4407.444083 4410.990953
25482 7185.892527 7204.911149 7193.700298

10194 rows × 3 columns

MAE of average predictions¶

In [23]:
print(f"MAE with mean of 3 OOFS predictions : {mean_absolute_error(train[target], oofs.mean(axis=1))}")
MAE with mean of 3 OOFS predictions : 337.32195548473095

MAE with Ridge on predictions¶

In [24]:
ridge_blend = Ridge(positive = True)
ridge_blend.fit(oofs, train[target])
print(f"MAE with Ridge Regression of 3 OOFS predictions : {mean_absolute_error(train[target], ridge_blend.predict(oofs))}\n\nCoefficients :")
display(pd.Series(ridge_blend.coef_.round(2), oofs.columns, name='weight'))
MAE with Ridge Regression of 3 OOFS predictions : 340.208877004165

Coefficients :
Patrick    0.69
Alex1      0.19
Alex2      0.09
Name: weight, dtype: float64

LAD Regression on the concatenated dataset¶

In [25]:
LADRegression_blend = LADRegression(positive = True)
LADRegression_blend.fit(oofs, train[target])
print(f"MAE with LAD Regression of 3 OOFS predictions : {mean_absolute_error(train[target], LADRegression_blend.predict(oofs))}\n\nCoefficients :")
display(pd.Series(LADRegression_blend.coef_.round(2), oofs.columns, name='weight'))
MAE with LAD Regression of 3 OOFS predictions : 336.86184680429363

Coefficients :
Patrick    0.65
Alex1      0.11
Alex2      0.24
Name: weight, dtype: float64

Collect MAE results for models and blends¶

In [26]:
result_df = pd.Series([
    mean_absolute_error(train[target], oofs["Patrick"]),
    mean_absolute_error(train[target], oofs["Alex1"]),
    mean_absolute_error(train[target], oofs["Alex2"]),
    mean_absolute_error(train[target], oofs.mean(axis=1)),
    mean_absolute_error(train[target], ridge_blend.predict(oofs)),
    mean_absolute_error(train[target], LADRegression_blend.predict(oofs)),
    ], index = ["Patrick", "Alex1", "Alex2", "Blend_Mean", "Blend_Ridge", "Blend_LAD"], name="MAE")
result_df
Out[26]:
Patrick        337.477006
Alex1          339.119403
Alex2          339.274151
Blend_Mean     337.321955
Blend_Ridge    340.208877
Blend_LAD      336.861847
Name: MAE, dtype: float64

MAE graph visualization¶

In [27]:
bars = plt.barh(result_df.index, result_df, color=["yellow", "yellow", "yellow", "brown", "brown", "green"])
plt.gca().bar_label(bars, fmt='%.2f')
plt.gca().invert_yaxis()
plt.yticks(np.arange(len(result_df)), result_df.index)
plt.xlabel('MAE')
plt.xlim(336, 341);

Stacked LAD Regression¶

This code performs a grid search using cross-validation to find the best hyperparameters for the LADRegression model. Here's how it works:

The hyperparameters to search are defined in the params dictionary. The hyperparameters include 'alpha', which represents the regularization strength, and 'l1_ratio', which controls the trade-off between L1 and L2 regularization.

An instance of the LADRegression model, reg, is created. LADRegression is a model for Least Absolute Deviation regression.

A grid search object, grid_search, is created using GridSearchCV. This object takes the reg model, the params dictionary, and other parameters such as the number of cross-validation folds (cv=5), the scoring metric (scoring='neg_mean_absolute_error'), and the number of jobs to run in parallel (n_jobs=-1).

The grid search object is fitted to the training data, where oofs represents the out-of-fold predictions from previous models and train[target] represents the target variable.

After the grid search is complete, the best hyperparameters and the corresponding score are printed. The best hyperparameters are obtained using grid_search.best_params_, and the best score (negative mean absolute error) is obtained using -grid_search.best_score_.

In [126]:
# Define the hyperparameters to search
params = {
    'alpha': [0.5, 1, 0.6, 0.7, 0.75, 0.8, 0.9, 0.85],
    'l1_ratio': [0.15, 0.2, 0.25, 0.3],
}

# Create an instance of LADRegression
reg = LADRegression(positive=True)

# Create a grid search object
grid_search = GridSearchCV(reg, params, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)

# Fit the grid search object to the training data
grid_search.fit(oofs, train[target])

# Print the best hyperparameters and the corresponding score
print("Best hyperparameters:", grid_search.best_params_)
print("Best score:", -grid_search.best_score_)
Best hyperparameters: {'alpha': 0.8, 'l1_ratio': 0.15}
Best score: 337.0114471951213

Submission¶

In [124]:
LADRegression_blend = LADRegression(alpha=0.8, l1_ratio=0.15,fit_intercept=True, positive=True)
LADRegression_blend.fit(oofs, train[target])
sub = pd.DataFrame(index=test.index)
sub["yield"] = LADRegression_blend.predict(preds)
sub["yield"].to_csv("submission.csv")