# Import data
import pandas as pd
recipes = pd.read_csv('recipe_site_traffic_2212.csv')
recipes.head(5)
| | recipe | calories | carbohydrate | sugar | protein | category | servings | high_traffic |
|---|---|---|---|---|---|---|---|---|
0 | 1 | NaN | NaN | NaN | NaN | Pork | 6 | High |
1 | 2 | 35.48 | 38.56 | 0.66 | 0.92 | Potato | 4 | High |
2 | 3 | 914.28 | 42.68 | 3.09 | 2.88 | Breakfast | 1 | NaN |
3 | 4 | 97.03 | 30.56 | 38.63 | 0.02 | Beverages | 4 | High |
4 | 5 | 27.05 | 1.85 | 0.80 | 0.53 | Beverages | 4 | NaN |
Running recipes.isna().sum() will output a series that shows the number of missing values in each column of the recipes dataframe. The output shows that the columns 'calories', 'carbohydrate', 'sugar', 'protein', and 'high_traffic' have null values.
# Check null values
recipes.isna().sum()
recipe            0
calories         52
carbohydrate     52
sugar            52
protein          52
category          0
servings          0
high_traffic    373
dtype: int64
recipes.dtypes
recipe            int64
calories        float64
carbohydrate    float64
sugar           float64
protein         float64
category         object
servings         object
high_traffic     object
dtype: object
This code uses scikit-learn's SimpleImputer class to impute missing values in four numerical columns: calories, carbohydrate, sugar, and protein. It first creates an instance of SimpleImputer with strategy='mean', which means that missing values in each column will be replaced by the mean of the non-missing values in that column.
Then, the code applies the imputation to each of the four columns by using the fit_transform() method of the imputer object on a subset of the recipes dataframe containing only the column of interest. The fit_transform() method fits the imputer to the column data and transforms the column by imputing missing values.
Finally, the code checks the number of missing values in each of the four columns after the imputation by using the isna().sum() method on a subset of the recipes dataframe containing only these four columns.
# Impute missing data to numerical columns
from sklearn.impute import SimpleImputer
num_imputer = SimpleImputer(strategy='mean')
# Calories
recipes[['calories']] = num_imputer.fit_transform(recipes[['calories']])
# Carbohydrate
recipes[['carbohydrate']] = num_imputer.fit_transform(recipes[['carbohydrate']])
# Sugar
recipes[['sugar']] = num_imputer.fit_transform(recipes[['sugar']])
# Protein
recipes[['protein']] = num_imputer.fit_transform(recipes[['protein']])
# Check imputation
recipes[['calories', 'carbohydrate', 'sugar', 'protein']].isna().sum()
calories        0
carbohydrate    0
sugar           0
protein         0
dtype: int64
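Since SimpleImputer computes a separate mean for every column it is given, the four calls above could be collapsed into a single one. A minimal sketch of that alternative (equivalent in effect, not run here):
# Impute all four numerical columns in one call; the imputer still learns one mean per column
num_cols = ['calories', 'carbohydrate', 'sugar', 'protein']
recipes[num_cols] = num_imputer.fit_transform(recipes[num_cols])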
# Check consistency of column category
recipes.category.unique()
array(['Pork', 'Potato', 'Breakfast', 'Beverages', 'One Dish Meal', 'Chicken Breast', 'Lunch/Snacks', 'Chicken', 'Vegetable', 'Meat', 'Dessert'], dtype=object)
Here we clean the servings column by normalizing string values such as '4 as a snack' to their numeric part and converting the column to integers. This ensures that servings contains only numeric values and avoids errors that could arise from mixed data types in the column.
# Servings
recipes.servings.unique()
recipes.servings = recipes.servings.replace({'4 as a snack': '4', '6 as a snack': '6'})
recipes.servings = recipes.servings.apply(int)
recipes.servings.unique()
array([6, 4, 1, 2])
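The explicit replace() above works because only these two free-text variants appear in the dataset. If other variants showed up in future data, a more general approach would be to extract the leading digits instead; a sketch of that alternative (it applies to the raw string column, so it is not needed after the conversion above):
# Pull the leading integer out of values such as '4 as a snack' (hypothetical alternative)
recipes.servings = recipes.servings.astype(str).str.extract(r'(\d+)')[0].astype(int)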
We perform a similar operation on the high_traffic column: missing values are filled with 'Low', and the 'High'/'Low' labels are mapped to 1/0, producing a binary target. This simplifies the high_traffic column for modeling and visualization and ensures that it contains only numeric values.
# High traffic
recipes.high_traffic.unique()
recipes['high_traffic'] = recipes['high_traffic'].fillna('Low')
recipes.high_traffic = recipes.high_traffic.replace({'High': 1, 'Low': 0})
recipes.high_traffic.unique()
array([1, 0])
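Since accuracy is used later to compare models, it is also worth a quick look at how balanced this binary target is (a simple check; output not shown here):
# Share of high- vs low-traffic recipes in the cleaned target
recipes.high_traffic.value_counts(normalize=True)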
We can notice relatively strong relationships between 'high_traffic' and 'calories', 'carbohydrate', and 'sugar'. However, these predictors are also highly correlated with each other. This suggests adopting non-linear models to limit overfitting.
# Create a heatmap of the correlation matrix
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(10,10))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Correlations of the numeric columns only, excluding the recipe id
sns.heatmap(recipes.select_dtypes('number').drop(columns='recipe').corr(), annot=True, vmax=1, vmin=-1, center=0, cmap=cmap)
plt.show()
For the exploratory analysis we define a function plot_hist() that takes a numerical column col and a number of bins as input parameters. The function then uses the distplot() function from the Seaborn library to create a histogram of the values in col. The bins parameter specifies the number of bins to use in the histogram.
Then, we call the function, creating a histogram plot of the calories column in the recipes dataframe. The resulting plot can be useful for visualizing the distribution of calorie values in the dataset and identifying any potential outliers or patterns in the data.
Here, we can notice that the distribution is right skewed, resembling a chi-square distribution.
sns.set_style("darkgrid")
def plot_hist(col, bins=20):
    sns.distplot(col, bins=bins)
    plt.show()
plot_hist(recipes.calories)
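Note that distplot() is deprecated in recent seaborn releases (0.11 and later). On newer versions, an equivalent helper could use histplot() instead; a sketch with a hypothetical name:
def plot_hist_v2(col, bins=20):
    # histplot with kde=True reproduces the histogram-plus-density look of distplot
    sns.histplot(col, bins=bins, kde=True)
    plt.show()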
This code defines a function plot_boxplot() that takes a numerical column col, a showfliers boolean parameter (default False), and a whis parameter specifying the whisker range (default [5, 95]) as input parameters. The function then uses the boxplot() function from the Seaborn library to create a box plot of the values in col.
The showfliers parameter controls whether or not to show outliers in the plot. The whis parameter controls the range of the whiskers in the plot, with the default values indicating the 5th and 95th percentiles.
The function then adds annotations to the plot showing the values of the 5th, 25th, 50th (median), 75th, and 95th percentiles of the data using the quantile() function from the pandas library.
The plt.show() function displays the box plot.
The last line calls plot_boxplot() on the calories column of the recipes dataframe, creating a box plot with whiskers extending from the 5th to the 95th percentile, outliers hidden by default, and annotations marking the 5th, 25th, 50th, 75th, and 95th percentiles.
This plot is useful for visualizing the distribution of calorie values, identifying potential outliers, and summarizing the spread of the data through its percentiles.
def plot_boxplot(col, showfliers=False, whis=[5, 95]):
    sns.boxplot(col, showfliers=showfliers, whis=whis)
    ax = plt.gca()
    # Annotate the 5th, 25th, 50th (median), 75th and 95th percentiles
    q_05 = col.quantile(0.05)
    q_25 = col.quantile(0.25)
    q_50 = col.median()
    q_75 = col.quantile(0.75)
    q_95 = col.quantile(0.95)
    labels = ['P5', 'Q1', 'Median', 'Q3', 'P95']
    for i, (label, quantile) in enumerate(zip(labels, [q_05, q_25, q_50, q_75, q_95])):
        ax.text(0.45 + i * 0.12 * 0.5, quantile, f'{label}\n{quantile:.3f}', ha='center', va='center', fontweight='bold')
    plt.show()
plot_boxplot(recipes.calories)
This function takes in a pandas dataframe (data), a column name (col), and a new column name (new_col_name) to store the binned values. It also takes in two optional arguments q_05 and q_25 to determine whether or not to use the 5th and 25th percentile as bin edges. If q_05 is True, then it uses the 5th percentile as the first bin edge. If q_25 is True, then it uses the 25th percentile as the first bin edge. If both are False, then it uses the median as the first bin edge.
The function then calculates the 75th and 95th percentile if needed and creates bin edges accordingly. The data is then binned using pd.cut() with the specified bin edges and labels. Finally, a seaborn distplot is used to plot the distribution of the newly binned column.
In this specific example, the function is called with the recipes dataframe, the calories column, and a new column name of binned_calories. By default, both q_05 and q_25 are True, which means that the function will use both the 5th and 25th percentiles as bin edges. The resulting plot shows the distribution of the binned calorie values.
By binning, we create more interpretable categories and shrink the effect of outliers on the model. We will perform this operation on all the numerical columns.
# Binning into categories
import numpy as np
def bin_plot_col(data, col, new_col_name, q_05=True, q_25=True):
    if q_05 is False and q_25 is False:
        q_50 = data[col].median()
        q_75 = data[col].quantile(0.75)
        q_95 = data[col].quantile(0.95)
        bins = [0, q_50, q_75, q_95, np.inf]
        data[new_col_name] = pd.cut(data[col], bins=bins, labels=[1, 2, 3, 4])
    elif q_05 is True and q_25 is False:
        q_05 = data[col].quantile(0.05)
        q_50 = data[col].median()
        q_75 = data[col].quantile(0.75)
        q_95 = data[col].quantile(0.95)
        bins = [0, q_05, q_50, q_75, q_95, np.inf]
        data[new_col_name] = pd.cut(data[col], bins=bins, labels=[1, 2, 3, 4, 5])
    elif q_05 is False and q_25 is True:
        q_25 = data[col].quantile(0.25)
        q_50 = data[col].median()
        q_75 = data[col].quantile(0.75)
        q_95 = data[col].quantile(0.95)
        bins = [0, q_25, q_50, q_75, q_95, np.inf]
        data[new_col_name] = pd.cut(data[col], bins=bins, labels=[1, 2, 3, 4, 5])
    else:
        q_05 = data[col].quantile(0.05)
        q_25 = data[col].quantile(0.25)
        q_50 = data[col].median()
        q_75 = data[col].quantile(0.75)
        q_95 = data[col].quantile(0.95)
        bins = [0, q_05, q_25, q_50, q_75, q_95, np.inf]
        data[new_col_name] = pd.cut(data[col], bins=bins, labels=[1, 2, 3, 4, 5, 6])
    sns.distplot(data[new_col_name])
    plt.show()
bin_plot_col(recipes, 'calories', 'binned_calories')
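The four branches above differ only in which quantiles are used as inner bin edges, so the same behavior can be expressed more compactly by building the quantile list programmatically. A sketch under that assumption (hypothetical helper name):
def bin_plot_col_compact(data, col, new_col_name, q_05=True, q_25=True):
    # Optional 5th/25th percentiles, then always the 50th, 75th and 95th
    qs = ([0.05] if q_05 else []) + ([0.25] if q_25 else []) + [0.5, 0.75, 0.95]
    bins = [0] + [data[col].quantile(q) for q in qs] + [np.inf]
    data[new_col_name] = pd.cut(data[col], bins=bins, labels=list(range(1, len(bins))))
    sns.distplot(data[new_col_name])
    plt.show()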
The column 'carbohydrate' is right skewed.
plot_hist(recipes.carbohydrate)
plot_boxplot(recipes.carbohydrate)
# Binning into categories
bin_plot_col(recipes, 'carbohydrate', 'binned_carbohydrate')
plot_hist(recipes.sugar, bins=10)
plot_boxplot(recipes.sugar)
bin_plot_col(recipes, 'sugar', 'binned_sugar', q_05=False)
plot_hist(recipes.protein)
plot_boxplot(recipes.protein)
bin_plot_col(recipes, 'protein', 'binned_protein', q_05=False)
The bar plot shows the number of recipes in each category. The most common categories are Breakfast, Chicken Breast, and Beverages.
recipes.category.value_counts().plot(kind='barh')
The bar plot of the 'servings' column shows that the most common value is 4 servings, with almost twice as many recipes as the second most common (6 servings).
recipes.servings.value_counts().plot(kind='bar')
Now we can go deeper with feature engineering, creating new columns derived from existing ones. This is done to surface potential hidden patterns in the data, which may improve the accuracy of the model.
The first calculation is to calculate the amount of fat in each recipe based on the calories, carbohydrates, and protein. This calculation uses the formula fat = (calories - carbohydrate * 4 - protein * 4) / 9, which assumes that fat provides 9 calories per gram, while carbohydrates and protein provide 4 calories per gram. The resulting values are then clipped at 0, so that negative values are set to 0.
The second calculation is to calculate the calories per serving of each recipe, which is done by dividing the total calories by the number of servings.
The third to sixth calculations compute the share of each macronutrient (carbohydrate, sugar, protein, and fat) in each recipe relative to the total grams of macronutrients, as macronutrient / (carbohydrate + sugar + protein + fat).
Finally, the seventh to tenth calculations estimate the share of calories coming from each macronutrient. These are computed as (macronutrient * 0.4) / calories for carbohydrate, sugar, and protein, and (fat * 0.9) / calories for fat; the factors mirror the usual 4 kcal/g and 9 kcal/g conversions scaled down by a factor of 10, and since all four columns use the same scaling, comparisons between recipes are unaffected. The resulting values are stored in new columns of the recipes dataframe.
recipes['fat'] = (recipes.calories - recipes.carbohydrate * 4 - recipes.protein * 4) / 9
recipes.fat = recipes.fat.apply(lambda x: 0 if x < 0 else x)
recipes['calories_per_serving'] = recipes.calories / recipes.servings
recipes['carb_percent'] = recipes.carbohydrate / (recipes.carbohydrate + recipes.sugar + recipes.protein + recipes.fat)
recipes['sugar_percent'] = recipes.sugar / (recipes.carbohydrate + recipes.sugar + recipes.protein + recipes.fat)
recipes['protein_percent'] = recipes.protein / (recipes.carbohydrate + recipes.sugar + recipes.protein + recipes.fat)
recipes['fat_percent'] = recipes.fat / (recipes.carbohydrate + recipes.sugar + recipes.protein + recipes.fat)
recipes['carb_calories_percent'] = recipes.carbohydrate*0.4 / recipes.calories
recipes['sugar_calories_percent'] = recipes.sugar*0.4 / recipes.calories
recipes['protein_calories_percent'] = recipes.protein*0.4 / recipes.calories
recipes['fat_calories_percent'] = recipes.fat*0.9 / recipes.calories
recipes[['fat','calories_per_serving', 'carb_percent', 'sugar_percent', 'protein_percent', 'fat_percent', 'carb_calories_percent', 'sugar_calories_percent', 'protein_calories_percent', 'fat_calories_percent']].head()
| | fat | calories_per_serving | carb_percent | sugar_percent | protein_percent | fat_percent | carb_calories_percent | sugar_calories_percent | protein_calories_percent | fat_calories_percent |
|---|---|---|---|---|---|---|---|---|---|---|
0 | 22.118145 | 72.656533 | 0.388009 | 0.100091 | 0.267187 | 0.244714 | 0.032179 | 0.008301 | 0.022158 | 0.045663 |
1 | 0.000000 | 8.870000 | 0.960638 | 0.016442 | 0.022920 | 0.000000 | 0.434724 | 0.007441 | 0.010372 | 0.000000 |
2 | 81.337778 | 914.280000 | 0.328339 | 0.023771 | 0.022156 | 0.625734 | 0.018673 | 0.001352 | 0.001260 | 0.080067 |
3 | 0.000000 | 24.257500 | 0.441555 | 0.558156 | 0.000289 | 0.000000 | 0.125982 | 0.159250 | 0.000082 | 0.000000 |
4 | 1.947778 | 6.762500 | 0.360780 | 0.156013 | 0.103359 | 0.379848 | 0.027357 | 0.011830 | 0.007837 | 0.064806 |
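As a quick sanity check of the fat formula against this table, rows 1 and 2 of the earlier head() output can be reproduced by hand (a verification sketch using those figures):
# Row 2: (914.28 - 4*42.68 - 4*2.88) / 9 = 732.04 / 9 ≈ 81.34 grams of fat
print((914.28 - 4 * 42.68 - 4 * 2.88) / 9)
# Row 1: (35.48 - 4*38.56 - 4*0.92) / 9 is negative, so it is clipped to 0
print(max((35.48 - 4 * 38.56 - 4 * 0.92) / 9, 0))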
The new column fat is right skewed.
plot_hist(recipes.fat)
25% of the values are equal to 0, therefore the first bin will be from 0 to the median.
plot_boxplot(recipes.fat)
After binning, the previously right-skewed distribution more closely resembles a normal distribution.
bin_plot_col(recipes, 'fat', 'binned_fat', q_05=False, q_25=False)
## Calories Per Serving
plot_hist(recipes.calories_per_serving)
plot_boxplot(recipes.calories_per_serving)
bin_plot_col(recipes, 'calories_per_serving', 'binned_calories_per_serving')
plot_hist(recipes.carb_percent, bins=15)
plot_boxplot(recipes.carb_percent)
bin_plot_col(recipes, 'carb_percent', 'binned_carb_percent')
plot_hist(recipes.sugar_percent)
plot_boxplot(recipes.sugar_percent)
bin_plot_col(recipes, 'sugar_percent', 'binned_sugar_percent', q_05=False, q_25=False)
plot_hist(recipes.protein_percent)
plot_boxplot(recipes.protein_percent)
bin_plot_col(recipes, 'protein_percent', 'binned_protein_percent', q_05=False, q_25=False)
plot_hist(recipes.fat_percent)
plot_boxplot(recipes.fat_percent)
bin_plot_col(recipes, 'fat_percent', 'binned_fat_percent', q_05=False, q_25=False)
plot_hist(recipes.carb_calories_percent, bins=40)
plot_boxplot(recipes.carb_calories_percent)
bin_plot_col(recipes, 'carb_calories_percent', 'binned_carb_calories_percent', q_05=False, q_25=False)
plot_hist(recipes.sugar_calories_percent)
plot_boxplot(recipes.sugar_calories_percent)
bin_plot_col(recipes, 'sugar_calories_percent', 'binned_sugar_calories_percent', q_05=False, q_25=False)
plot_hist(recipes.protein_calories_percent, bins=30)
plot_boxplot(recipes.protein_calories_percent)
bin_plot_col(recipes, 'protein_calories_percent', 'binned_protein_calories_percent', q_05=False, q_25=False)
plot_hist(recipes.fat_calories_percent)
plot_boxplot(recipes.fat_calories_percent)
bin_plot_col(recipes, 'fat_calories_percent', 'binned_fat_calories_percent', q_05=False, q_25=False)
Most of these distributions are right skewed, showing that most recipes have low values for these features.
Then, we create a new DataFrame data by applying one-hot encoding to the categorical features in the recipes DataFrame, as well as the binning features that were added earlier using the bin_plot_col function.
The pd.get_dummies function is used to perform one-hot encoding on the specified columns, and the drop_first=True argument is used to drop the first column of each set of dummy variables to avoid multicollinearity issues.
After creating data, several columns are dropped using the drop function, including all the original numerical features that were used to derive the binned features, as well as the binned version of calories_per_serving, which was likely dropped because it was highly correlated with the calories feature.
Finally, the columns.to_list() method is used to display a list of all the remaining columns in data.
data = pd.get_dummies(recipes, columns=['category', 'servings', 'binned_calories', 'binned_carbohydrate',
'binned_sugar', 'binned_protein', 'binned_fat',
'binned_carb_percent', 'binned_sugar_percent', 'binned_protein_percent',
'binned_fat_percent', 'binned_carb_calories_percent',
'binned_sugar_calories_percent', 'binned_protein_calories_percent',
'binned_fat_calories_percent'], drop_first=True)
data.drop(['recipe','calories', 'carbohydrate', 'sugar', 'protein', 'fat',
'calories_per_serving', 'carb_percent', 'sugar_percent',
'protein_percent', 'fat_percent', 'carb_calories_percent',
'sugar_calories_percent', 'protein_calories_percent', 'fat_calories_percent',
'binned_calories_per_serving'], axis=1, inplace=True)
data.columns.to_list()
['high_traffic', 'category_Breakfast', 'category_Chicken', 'category_Chicken Breast', 'category_Dessert', 'category_Lunch/Snacks', 'category_Meat', 'category_One Dish Meal', 'category_Pork', 'category_Potato', 'category_Vegetable', 'servings_2', 'servings_4', 'servings_6', 'binned_calories_2', 'binned_calories_3', 'binned_calories_4', 'binned_calories_5', 'binned_calories_6', 'binned_carbohydrate_2', 'binned_carbohydrate_3', 'binned_carbohydrate_4', 'binned_carbohydrate_5', 'binned_carbohydrate_6', 'binned_sugar_2', 'binned_sugar_3', 'binned_sugar_4', 'binned_sugar_5', 'binned_protein_2', 'binned_protein_3', 'binned_protein_4', 'binned_protein_5', 'binned_fat_2', 'binned_fat_3', 'binned_fat_4', 'binned_carb_percent_2', 'binned_carb_percent_3', 'binned_carb_percent_4', 'binned_carb_percent_5', 'binned_carb_percent_6', 'binned_sugar_percent_2', 'binned_sugar_percent_3', 'binned_sugar_percent_4', 'binned_protein_percent_2', 'binned_protein_percent_3', 'binned_protein_percent_4', 'binned_fat_percent_2', 'binned_fat_percent_3', 'binned_fat_percent_4', 'binned_carb_calories_percent_2', 'binned_carb_calories_percent_3', 'binned_carb_calories_percent_4', 'binned_sugar_calories_percent_2', 'binned_sugar_calories_percent_3', 'binned_sugar_calories_percent_4', 'binned_protein_calories_percent_2', 'binned_protein_calories_percent_3', 'binned_protein_calories_percent_4', 'binned_fat_calories_percent_2', 'binned_fat_calories_percent_3', 'binned_fat_calories_percent_4']
We split the data into training and testing sets using the train_test_split() function from Scikit-learn. The data DataFrame is first separated into the predictor variables (X) and the target variable (y). The high_traffic column is used as the target variable, while all other columns are used as predictors.
# Train-test split
from sklearn.model_selection import train_test_split
X = data.drop(['high_traffic'], axis=1)
y = data['high_traffic']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1, stratify=y)
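As a quick sanity check (output not shown), stratify=y should keep the share of high-traffic recipes roughly equal across the two splits:
# Proportion of high-traffic recipes in the train and test splits
print(y_train.mean(), y_test.mean())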
Now we can run a genetic programming search using the TPOTClassifier from the TPOT library.
The TPOTClassifier is a machine learning pipeline optimizer that uses genetic programming to search for the best machine learning pipeline for a given problem. It searches through a large space of possible pipelines using a combination of genetic operators, such as mutation, crossover, and selection.
In this case, the TPOTClassifier is being used to optimize a classification model for predicting whether a recipe will generate high traffic on a cooking website. The input data is the X_train and y_train datasets, which were split from the original dataset using a 75/25 train-test split. The generations, population_size, offspring_size, and cv parameters control the size and complexity of the search space, the size of the population, and the number of cross-validation folds used during training.
After fitting the TPOTClassifier on the training data, the score() method is used to evaluate the accuracy of the resulting pipeline on the test data (X_test and y_test). The accuracy score is printed to the console.
Here, the best pipeline found by the search stacks an SGDClassifier with an ExtraTreesClassifier, reaching roughly 79% accuracy on the test data.
# Genetic programming search with TPOT
from tpot import TPOTClassifier
tpot = TPOTClassifier(generations=10, population_size=5,
verbosity=2, offspring_size=5,
scoring='accuracy', cv=5)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
Generation 1 - Current best internal CV score: 0.6943661971830986
Generation 2 - Current best internal CV score: 0.6943661971830986
Generation 3 - Current best internal CV score: 0.7352112676056338
Generation 4 - Current best internal CV score: 0.7352112676056338
Generation 5 - Current best internal CV score: 0.7352112676056338
Generation 6 - Current best internal CV score: 0.7352112676056338
Generation 7 - Current best internal CV score: 0.7352112676056338
Generation 8 - Current best internal CV score: 0.7352112676056338
Generation 9 - Current best internal CV score: 0.7352112676056338
Generation 10 - Current best internal CV score: 0.7352112676056338
Best pipeline: ExtraTreesClassifier(SGDClassifier(input_matrix, alpha=0.01, eta0=0.01, fit_intercept=True, l1_ratio=0.5, learning_rate=constant, loss=hinge, penalty=elasticnet, power_t=0.0), bootstrap=True, criterion=entropy, max_features=0.2, min_samples_leaf=7, min_samples_split=2, n_estimators=100)
0.7890295358649789
tpot.fitted_pipeline_
Pipeline(steps=[('stackingestimator', StackingEstimator(estimator=SGDClassifier(alpha=0.01, eta0=0.01, l1_ratio=0.5, learning_rate='constant', penalty='elasticnet', power_t=0.0))), ('extratreesclassifier', ExtraTreesClassifier(bootstrap=True, criterion='entropy', max_features=0.2, min_samples_leaf=7))])
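TPOT can also export the best pipeline it found as a standalone Python script, which makes it easier to reproduce outside the notebook (the filename below is just an example):
# Write the winning pipeline to a script file
tpot.export('tpot_best_pipeline.py')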
Following the pipeline suggested by the genetic search, we build a StackingClassifier, an ensemble learning method that combines one or more base classifiers with a final meta-classifier to improve the overall performance of the model.
First, it defines a base estimator, which is an instance of SGDClassifier, a linear classifier that uses stochastic gradient descent for optimization. The base estimator is then passed as input to the StackingClassifier, along with a final estimator, which is an ExtraTreesClassifier, an ensemble classifier that uses decision trees and random subsets of features to improve performance.
The StackingClassifier is then trained on the training data using the .fit() method, and predictions are made on the test data using the .predict() method. The accuracy of the model on the test data is computed using the accuracy_score function from scikit-learn, and printed to the console.
The accuracy for this model is 82.28%
# SGDClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import ExtraTreesClassifier, StackingClassifier
from sklearn.metrics import accuracy_score
base_estimator = SGDClassifier(
alpha=0.01,
eta0=0.01,
fit_intercept=True,
l1_ratio=0.5,
learning_rate='constant',
loss='hinge',
penalty='elasticnet',
power_t=0.0
)
estimators = [('base_estimator', base_estimator)]
clf = StackingClassifier(
estimators=estimators,
final_estimator=ExtraTreesClassifier(
bootstrap=True,
criterion='entropy',
max_features=0.2,
min_samples_leaf=7,
min_samples_split=2,
n_estimators=100,
verbose=1
),
verbose=1
)
# Fit the classifier on the training data
clf.fit(X_train, y_train)
# Make predictions on the test and training data
y_pred_sgd = clf.predict(X_test)
y_train_sgd = clf.predict(X_train)
accuracy_sgd = accuracy_score(y_test, y_pred_sgd)
print("Accuracy on test data: {}%".format(accuracy_sgd.round(4)*100))
Accuracy on test data: 82.28%
Now, we can try to tune the SGDClassifier using GridSearch.
First, the base estimator and the estimators for the stacking classifier are defined. Then, a stacking classifier is created with the base estimator and an ExtraTreesClassifier as the final estimator.
Next, a parameter grid is defined for the hyperparameter search space. The GridSearchCV object is created with the stacking classifier, the parameter grid, 5-fold cross-validation, 'accuracy' as the scoring metric, and n_jobs set to -1 to use all available cores.
The GridSearchCV object is fit to the training data, and the best classifier and its parameters are obtained. The accuracy score of the best classifier on the test data is also computed.
The accuracy score is 81.86%
# Grid Search with SGDClassifier
from sklearn.model_selection import GridSearchCV
# Define the base estimator
base_estimator = SGDClassifier(
alpha=0.01,
eta0=0.01,
fit_intercept=True,
l1_ratio=0.5,
learning_rate='constant',
loss='hinge',
penalty='elasticnet',
power_t=0.0
)
# Define the estimators for the stacking classifier
estimators = [('base_estimator', base_estimator)]
# Define the stacking classifier
clf = StackingClassifier(
estimators=estimators,
final_estimator=ExtraTreesClassifier(
bootstrap=True,
criterion='entropy',
max_features=0.2,
min_samples_leaf=7,
min_samples_split=2,
n_estimators=100,
verbose=1
),
verbose=1
)
# Define the hyperparameter search space
param_grid = {
'base_estimator__alpha': [0.005, 0.01, 0.1],
'base_estimator__eta0': [0.005, 0.01, 0.1],
'base_estimator__penalty': ['l1', 'l2', 'elasticnet'],
'final_estimator__n_estimators': [50, 100, 150],
'final_estimator__max_features': [0.15, 0.2, 0.25],
'final_estimator__min_samples_leaf': [6, 7, 8]
}
# Define the GridSearchCV object
grid_search = GridSearchCV(
estimator=clf,
param_grid=param_grid,
cv=5,
scoring='accuracy',
n_jobs=-1
)
# Fit the GridSearchCV object to the training data
grid_search.fit(X_train, y_train)
# Get the best classifier and its parameters
best_clf = grid_search.best_estimator_
best_params = grid_search.best_params_
y_pred_sgd_cv = best_clf.predict(X_test)
y_train_sgd_cv = best_clf.predict(X_train)
accuracy_sgd_cv = accuracy_score(y_test, y_pred_sgd_cv)
# Print the best parameters and the score of the best classifier on the test data
print('Best parameters:', best_params)
print('Accuracy score: {}%'.format(round(accuracy_sgd_cv, 4)*100))
Even though the genetic search recommended an SGDClassifier-based pipeline, it is good practice to check other models as well. Here we train an XGBoost classifier.
xgb.XGBClassifier is instantiated with a binary logistic objective and eval_metric='error' (classification error). Note that optimizer='sgd' is not a recognized XGBoost parameter: XGBoost fits gradient-boosted trees, and the warning in the output below confirms that the parameter is simply ignored. The classifier is trained on the training data with .fit(X_train, y_train), labels for the test data are predicted with .predict(X_test), and the accuracy on the test data is computed with accuracy_score(y_test, y_pred_xgb) and printed.
# Train an XGBoost classifier
import xgboost as xgb
from sklearn.metrics import accuracy_score
# Instantiate the XGBoost classifier (note: 'optimizer' is not a recognized XGBoost parameter and is ignored)
clf = xgb.XGBClassifier(objective='binary:logistic', optimizer='sgd', eval_metric='error')
# Train classifier on the training data
clf.fit(X_train, y_train)
# Predict labels for the test data
y_pred_xgb = clf.predict(X_test)
y_train_xgb = clf.predict(X_train)
# Calculate accuracy score on the test data
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print("Accuracy on test data: {}%".format(round(accuracy_xgb, 4)*100))
[20:08:58] WARNING: ../src/learner.cc:627: Parameters: { "optimizer" } might not be used. This could be a false alarm, with some parameters getting used by language bindings but then being mistakenly passed down to XGBoost core, or some parameter actually being used but getting flagged wrongly here. Please open an issue if you find any such cases.
Accuracy on test data: 70.46%
Now, we use the same model, but with cross-validation using early stopping to prevent overfitting. Here is a breakdown of what each part of the code does:
recipes_dmatrix = xgb.DMatrix(data=X_train, label=y_train) creates a DMatrix object from the training data, which is a data structure used by XGBoost to optimize memory usage and speed up training.
params is a dictionary containing hyperparameters for the XGBoost classifier, including the objective function to optimize, the maximum depth of each tree, the learning rate, gamma (a regularization parameter), and reg_lambda (L2 regularization).
xgb.cv() is used to perform cross-validation. The parameters passed to this function include the DMatrix object created from the training data, the hyperparameters defined in the params dictionary, the number of boosting rounds to perform, and the number of folds for cross-validation.
early_stopping_rounds specifies the number of rounds of boosting that can occur without any improvement in the validation error. If the validation error does not improve after this many rounds, then the training stops early. metrics specifies the metric to use for evaluation. In this case, it is the error rate.
as_pandas=True returns the results as a Pandas DataFrame. best_accuracy = 1 - cv_results['test-error-mean'].min() calculates the best accuracy score based on the minimum test error rate from the cross-validation results.
best_rounds = cv_results['test-error-mean'].idxmin() + 1 calculates the number of boosting rounds required to achieve the best accuracy score. Finally, the best accuracy score and number of boosting rounds are printed using formatted strings.
#Cross validation in XGBoost
from sklearn.metrics import accuracy_score
# Create DMatrix object for training data
recipes_dmatrix = xgb.DMatrix(data=X_train, label=y_train)
# Set hyperparameters
params = {
'objective': 'binary:logistic',
'max_depth': 4,
'learning_rate': 0.1,
'gamma': 0.1,
'reg_lambda': 1
}
# Perform cross-validation with early stopping
cv_results = xgb.cv(
dtrain=recipes_dmatrix,
params=params,
num_boost_round=1000,
early_stopping_rounds=100,
nfold=4,
metrics='error',
as_pandas=True,
seed=1
)
# Print best accuracy score and number of boosting rounds
best_accuracy = 1 - cv_results['test-error-mean'].min()
best_rounds = cv_results['test-error-mean'].idxmin() + 1
print(f"Best score: {best_accuracy:.4f} (at round {best_rounds})")
Best score: 0.7253 (at round 97)
# Train XGBoost model on full training data using best number of boosting rounds
clf = xgb.train(params=params, dtrain=recipes_dmatrix, num_boost_round=best_rounds)
# Predict labels for test data and calculate accuracy score
y_pred_xgb_cv = clf.predict(xgb.DMatrix(X_test))
y_pred_binary_xgb_cv = [1 if p >= 0.5 else 0 for p in y_pred_xgb_cv]
accuracy_xgb_cv = accuracy_score(y_test, y_pred_binary_xgb_cv)
y_train_xgb_cv = clf.predict(xgb.DMatrix(X_train))
y_train_binary_xgb_cv = [1 if p >= 0.5 else 0 for p in y_train_xgb_cv]
print(f"Accuracy on test data: {round(accuracy_xgb_cv*100, 2)}%")
Accuracy on test data: 75.95%
Now, we build a Random Forest classifier with cross-validation and hyperparameter tuning using GridSearchCV.
# Random Forest Classifier with CV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
# Define a RandomForestClassifier
rf = RandomForestClassifier(random_state=42)
# Define a grid of hyperparameters to search over
param_grid = {
'n_estimators': [100, 200, 500],
'max_depth': [3, 5, None],
'max_features': ['sqrt', 'log2']
}
# Perform grid search cross-validation with 4 folds
grid_search = GridSearchCV(
estimator=rf,
param_grid=param_grid,
cv=4,
scoring='accuracy',
verbose=1
)
# Fit the grid search to the training data
grid_search.fit(X_train, y_train)
# Print best hyperparameters and accuracy score
best_params = grid_search.best_params_
best_accuracy = grid_search.best_score_
print(f"Best hyperparameters: {best_params}")
print(f"Accuracy on validation data: {best_accuracy:.4f}")
Fitting 4 folds for each of 18 candidates, totalling 72 fits
Best hyperparameters: {'max_depth': None, 'max_features': 'log2', 'n_estimators': 500}
Accuracy on validation data: 0.6635
# Train RandomForestClassifier on full training data using best hyperparameters
rf = RandomForestClassifier(random_state=42, **best_params)
rf.fit(X_train, y_train)
# Predict labels for test data and calculate accuracy score
y_pred_rf_cv = rf.predict(X_test)
y_train_rf_cv = rf.predict(X_train)
accuracy_rf_cv = accuracy_score(y_test, y_pred_rf_cv)
print(f"Accuracy on test data: {round(accuracy_rf_cv*100, 2)}%")
Accuracy on test data: 70.89%
We create stacked features for the training and test data by combining the predictions of multiple models. Specifically, we build one DataFrame for the training data (X_train_stacked) and one for the test data (X_test_stacked), where each row represents an observation and each column represents a model.
The columns in X_train_stacked are:
sgd_pred: the predicted labels for the training data from the stacked SGDClassifier/ExtraTrees model. sgd_cv_pred: the predicted labels from the grid-searched version of that model. xgb_pred: the predicted probabilities from the cross-validated XGBoost model. rf_cv_pred: the predicted labels from the cross-validated Random Forest. The columns in X_test_stacked are the same, but the predictions are made on the test data instead of the training data.
By stacking the predictions from multiple models, we can create new features that may help to improve the overall predictive performance of our final model. These stacked features are then used as input to a second-level model, which makes the final predictions.
# Model Stacking
X_train_stacked = pd.DataFrame({
'sgd_pred': y_train_sgd,
'sgd_cv_pred': y_train_sgd_cv,
'xgb_pred': y_train_xgb_cv,
'rf_cv_pred': y_train_rf_cv
})
X_test_stacked = pd.DataFrame({
'sgd_pred': y_pred_sgd,
'sgd_cv_pred': y_pred_sgd_cv,
'xgb_pred': y_pred_xgb_cv,
'rf_cv_pred': y_pred_rf_cv
})
This code performs logistic regression on the stacked predictions of the four models: the two SGD-based stacking classifiers, XGBoost, and Random Forest.
The previous cell created the two data frames X_train_stacked and X_test_stacked, which hold the predictions made by each model on the training and test sets, respectively.
A logistic regression model is then instantiated and trained on X_train_stacked and y_train, the target variable. The model makes predictions on X_test_stacked, and the accuracy score is calculated by comparing the predicted labels to the true labels in y_test.
The accuracy score is 71%.
# Logistic Regression w/ Model Stacking
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train_stacked, y_train)
y_pred_lr = lr.predict(X_test_stacked)
accuracy_lr = accuracy_score(y_test, y_pred_lr)
print(accuracy_lr)
0.7088607594936709
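One way to see how much weight the meta-model gives to each base model is to inspect the logistic regression coefficients (a quick sketch; output not shown here):
# Weight assigned by the meta-model to each base model's predictions
print(dict(zip(X_train_stacked.columns, lr.coef_[0])))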
Lastly, we generate a horizontal bar chart using Matplotlib to display the accuracy of different machine learning models. The chart is created from a Pandas DataFrame called results, which has two columns: "Model" and "Accuracy".
The first column contains the names of the models, and the second column contains the accuracy scores for each model. The horizontal bar chart is created using the plt.barh() method, with the 'Model' column used as the y-axis and the 'Accuracy' column used as the width of the bars.
The sort_values() method is used to sort the DataFrame by the 'Accuracy' column, so that the bars are displayed in order of decreasing accuracy.
# Table of Accuracy results
results = pd.DataFrame({
'Model': ['SGDClassifier', 'SGDClassifier w/ CV', 'XGBoostClassifier', 'XGBoostClassifier w/ CV', 'RandomForestClassifier w/ CV',
'Logistic RegressionStacked'],
'Accuracy': [accuracy_sgd, accuracy_sgd_cv, accuracy_xgb,
accuracy_xgb_cv, accuracy_rf_cv, accuracy_lr]
})
plt.barh(data=results.sort_values(by='Accuracy'), y='Model', width='Accuracy')
The stacking pipeline built around the SGDClassifier reported the highest accuracy on the test data among all the models evaluated, making it the best-performing model at classifying whether or not a recipe will generate high traffic. Its accuracy was 82.28%, meaning it correctly classified about 82% of the recipes in the test dataset. The SGDClassifier base learner is a linear classifier that works well with high-dimensional data such as the one-hot encoded features used here, making it a good choice for this type of classification problem.