Tutorial on Instacart Data Analysis¶
Author: Tanner Martz
Tulane University
CMPS 3160 Intro. to Data Science
Project Overview¶
Objective¶
The primary goal is to determine the most popular items ordered on Instacart and understand the factors influencing their popularity. This knowledge will then be used to predict the number of orders for new products.
Approach¶
- Data Collection and Preparation:
- Instacart dataset - (https://www.kaggle.com/competitions/instacart-market-basket-analysis/data)
- USDA FoodData Central database for nutritional information - (https://fdc.nal.usda.gov/index.html)
- Exploratory Data Analysis (EDA): Analyzing reorder patterns, popular products, and their features.
- Feature Engineering: Enhancing the dataset with nutritional data and other relevant attributes.
- Machine Learning Modeling: Utilizing Random Forest and XGBoost regressors to predict product popularity based on various features.
Insights¶
- Key Factors: Reorder history, product type (especially fruits and vegetables), and nutritional content (water and protein content) are crucial in determining a product's popularity.
- Health Trend: A positive correlation between the healthiness of products (high in water and protein) and their popularity.
- Future Directions: Expansion of the project to include additional data sources, feature engineering, different machine learning algorithms, and enhanced visualizations.
Methodology and Code¶
- Data Analysis: Code snippets for data manipulation, visualization, and analysis using Pandas, Seaborn, and Matplotlib.
- Machine Learning: Code for model building, training, and evaluation, including feature importance analysis.
Conclusions¶
The study reveals a strong influence of reorder rates, product categories, and nutritional values on product popularity on Instacart. This understanding can guide effective product placement strategies and inventory management for online grocery stores.
This tutorial serves as a comprehensive guide to understanding consumer behavior in online grocery shopping, particularly on Instacart, using data science techniques. The methodologies and insights presented here can be leveraged by data scientists and market analysts in similar contexts.
Tutorial¶
Introduction¶
Welcome to our comprehensive analysis of the Instacart Online Grocery Shopping Dataset. In today’s fast-paced world, the convenience of online grocery shopping has become a cornerstone of modern living. Understanding customer preferences and purchasing patterns in this domain is not just an academic interest, but a crucial business imperative. In this tutorial, we delve deep into the intricate world of consumer behavior within the e-commerce grocery sector, leveraging the power of data science.
As we navigate through this rich dataset, our goal is to uncover the secrets behind the most popular items ordered on Instacart. We aim to explore the factors driving these preferences and harness these insights to predict the potential success of new products. This exploration is more than just a technical endeavor; it's a journey into the heart of consumer decision-making, opening doors to enhanced business strategies and improved customer satisfaction in the online retail world.
Join us as we embark on this exciting journey, blending rigorous data analysis with practical insights, all through the lens of data science.
Data Overview¶
In this section, we dive into the heart of our analysis: the Instacart dataset. This rich dataset, publicly available on Kaggle, offers a real-world glimpse into over 3 million grocery orders from more than 200,000 Instacart users. It includes detailed information about products, aisles, departments, and the order sequence of each user.
To complement this, we've also integrated nutritional information from the USDA FoodData Central database. This crucial step allows us to examine not just the purchasing patterns but also the nutritional preferences of consumers.
Our preparation process involved meticulous data cleaning and transformation. We handled missing values, standardized data types, and merged datasets from multiple sources to create a cohesive, analyzable dataset. This preparation lays the groundwork for a reliable and insightful exploration in the subsequent stages of our analysis.
Importing the Kaggle Data:¶
import pandas as pd # Import the pandas library for data manipulation and analysis
root = 'input/' # Set the root directory for the data
# Load all the data from csv files into dataframes
orders = pd.read_csv(root + 'orders.csv')
order_products_train = pd.read_csv(root + 'order_products__train.csv')
order_products_prior = pd.read_csv(root + 'order_products__prior.csv')
products = pd.read_csv(root + 'products.csv')
aisles = pd.read_csv(root + 'aisles.csv')
departments = pd.read_csv(root + 'departments.csv')
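With the tables loaded, they can be joined into a single line-item-level frame. The exact merge pipeline that produced the intermediate CSVs used later in this tutorial is not shown in the notebook; the following is a minimal sketch of that kind of join, assuming the dataframes loaded above:

# Sketch: join the normalized Instacart tables into one line-item-level frame
line_items = (order_products_prior
              .merge(orders, on='order_id', how='left')
              .merge(products, on='product_id', how='left')
              .merge(aisles, on='aisle_id', how='left')
              .merge(departments, on='department_id', how='left'))
print(line_items.columns.tolist())  # one row per product within an order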
Importing the USDA Data Via API:¶
# CAUTION: Long runtime (7+ minutes)
import requests
import pandas as pd
# USDA API URL and Key
api_url = "https://api.nal.usda.gov/fdc/v1/foods/search"
api_key = "jG5MlVf3ii60bSNwcQNcByXkedFgPgiVSCbmeemW"
# Function to fetch data from USDA API
def fetch_usda_data(query, api_key):
    params = {
        "query": query,
        "api_key": api_key,
        "dataType": ["Foundation"],
        "pageSize": 1
    }
    response = requests.get(api_url, params=params)
    if response.status_code == 200:
        return response.json()
    else:
        return None
# Function to extract required nutrients from the response
def extract_nutrients(data):
    nutrients = {"Water": None, "Energy": None, "Protein": None,
                 "Total lipid (fat)": None, "Carbohydrate, by difference": None}
    if data and 'foods' in data and len(data['foods']) > 0:
        food_data = data['foods'][0]
        for nutrient in food_data.get('foodNutrients', []):
            nutrient_name = nutrient.get('nutrientName')
            if nutrient_name in nutrients:
                nutrients[nutrient_name] = nutrient.get('value')
    return nutrients
# Load the CSV file
df = pd.read_csv('/Users/tannermartz/Documents/GitHub/tmartzDS/Output/ReOrdersOnly/3ProductsNew.csv')
# Iterate over each product and fetch data
for index, row in df.iterrows():
    product_name = row['product_name']
    api_response = fetch_usda_data(product_name, api_key)
    nutrient_data = extract_nutrients(api_response)
    # Update the DataFrame with nutrient data
    for nutrient, value in nutrient_data.items():
        df.at[index, nutrient] = value
# Save the updated DataFrame to a new CSV file
df.to_csv('/Users/tannermartz/Documents/GitHub/tmartzDS/Output/ReOrdersOnly/3ProductsNew.csv', index=False)
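One practical caveat: api.data.gov enforces per-key rate limits, and a 7+ minute loop over many products can fail partway through. Below is a minimal sketch of a retry wrapper with a pause between attempts; fetch_with_retry is a hypothetical helper, not part of the original notebook, and could stand in for fetch_usda_data in the loop above:

import time

def fetch_with_retry(query, api_key, retries=3, pause=0.5):
    # Retry transient failures and pause between attempts to stay under the rate limit
    for attempt in range(retries):
        response = requests.get(api_url, params={
            "query": query, "api_key": api_key,
            "dataType": ["Foundation"], "pageSize": 1})
        if response.status_code == 200:
            return response.json()
        time.sleep(pause * (attempt + 1))  # simple linear backoff
    return None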
Exploratory Data Analysis¶
Our EDA aims to uncover hidden patterns and insights from the Instacart dataset. We meticulously examined order frequencies, product reorder rates, and the distribution of purchases across various departments and aisles. This analysis was augmented with visualizations like bar graphs and heatmaps, providing a clear, intuitive understanding of consumer buying habits. Our EDA not only highlights the most popular products but also gives us a peek into the diverse range of items preferred by Instacart users.
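The plots below read pre-computed, product-level aggregates (e.g. ProductsNew.csv) that were built offline. For context, here is a sketch of how such a table could be derived from the raw Kaggle frames loaded earlier; the column names mirror the table shown further down:

# Sketch: derive product-level aggregates from the raw Kaggle tables
prior = order_products_prior.merge(products, on='product_id', how='left')

product_stats = prior.groupby(['product_id', 'product_name', 'aisle_id', 'department_id']).agg(
    order_count=('order_id', 'count'),        # times the product was ordered
    reorder_count=('reordered', 'sum'),       # times it was a repeat purchase
    cart_order=('add_to_cart_order', 'mean')  # average position in the cart
).reset_index()

product_stats['perc_reorder'] = 100 * product_stats['reorder_count'] / product_stats['order_count']
product_stats['popularity_rank'] = product_stats['order_count'].rank(ascending=False, method='first').astype(int)
product_stats = product_stats.sort_values('order_count', ascending=False)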
Top Products:¶
# Plot top products by Order Count
df = pd.read_csv('/Users/tannermartz/Documents/GitHub/tmartzDS/Output/ReOrdersOnly/3ProductsNew.csv')
df.head(50).plot.bar(x='product_name', y='order_count', figsize=(20,10), title='Top 50 Products by Order Count')
<Axes: title={'center': 'Top 50 Products by Order Count'}, xlabel='product_name'>
Top Aisles:¶
# Plot top Aisles by Order Count
df = pd.read_csv('/Users/tannermartz/Documents/GitHub/tmartzDS/Output/AislesNew.csv')
df.groupby('aisle')['order_count'].sum().sort_values(ascending=False)[:25].plot.bar(figsize=(10,5), title='Top Aisles by Order Count')
<Axes: title={'center': 'Top Aisles by Order Count'}, xlabel='aisle'>
Top Departments:¶
# Plot top Departments by Order Count
df = pd.read_csv('/Users/tannermartz/Documents/GitHub/tmartzDS/Output/DepartmentsNew.csv')
df.groupby('department')['order_count'].sum().sort_values(ascending=False)[:25].plot.bar(figsize=(10,5), title='Top Departments by Order Count')
<Axes: title={'center': 'Top Departments by Order Count'}, xlabel='department'>
Most Reordered Products¶
# Products with highest reorder ratio
df = pd.read_csv('/Users/tannermartz/Documents/GitHub/tmartzDS/Output/ProductsNew.csv')
# Print df of products with highest reorder ratio
df.sort_values(by='perc_reorder', ascending=False)[:25]
|    | product_id | order_count | reorder_count | perc_reorder | product_name | aisle_id | department_id | popularity_rank | cart_order |
|----|-----------|-------------|---------------|--------------|--------------|----------|---------------|-----------------|------------|
| 92 | 38689 | 36869 | 31394 | 85.15 | Organic Reduced Fat Milk | 84 | 16 | 93 | 4.72 |
| 0 | 24852 | 491291 | 415166 | 84.51 | Banana | 24 | 4 | 1 | 4.89 |
| 1 | 13176 | 394930 | 329275 | 83.38 | Bag of Organic Bananas | 24 | 4 | 2 | 5.10 |
| 9 | 27845 | 142813 | 118684 | 83.10 | Organic Whole Milk | 84 | 16 | 10 | 5.43 |
| 48 | 19660 | 58312 | 47392 | 81.27 | Spring Water | 115 | 7 | 49 | 4.55 |
| 63 | 5785 | 49374 | 39820 | 80.65 | Organic Reduced Fat 2% Milk | 84 | 16 | 64 | 5.75 |
| 4 | 47209 | 220877 | 176173 | 79.76 | Organic Hass Avocado | 24 | 4 | 5 | 6.78 |
| 82 | 3957 | 39271 | 30966 | 78.85 | 100% Raw Coconut Water | 31 | 7 | 83 | 6.19 |
| 25 | 49235 | 79006 | 61801 | 78.22 | Organic Half & Half | 53 | 16 | 26 | 6.08 |
| 91 | 4210 | 36968 | 28837 | 78.01 | Whole Milk | 84 | 16 | 92 | 5.19 |
| 2 | 21137 | 275577 | 214448 | 77.82 | Organic Strawberries | 24 | 4 | 3 | 7.25 |
| 89 | 196 | 37298 | 29012 | 77.78 | Soda | 77 | 7 | 90 | 3.72 |
| 84 | 23909 | 38631 | 30036 | 77.75 | 2% Reduced Fat Milk | 84 | 16 | 85 | 5.44 |
| 74 | 11520 | 42274 | 32821 | 77.64 | Large Alfresco Eggs | 86 | 16 | 75 | 5.79 |
| 3 | 21903 | 251705 | 194939 | 77.45 | Organic Baby Spinach | 123 | 4 | 4 | 7.43 |
| 24 | 44632 | 79245 | 61175 | 77.20 | Sparkling Water Grapefruit | 115 | 7 | 25 | 6.22 |
| 10 | 27966 | 142603 | 109688 | 76.92 | Organic Raspberries | 123 | 4 | 11 | 7.21 |
| 43 | 22035 | 61669 | 47204 | 76.54 | Organic Whole String Cheese | 21 | 16 | 44 | 8.07 |
| 34 | 27086 | 71641 | 54626 | 76.25 | Half & Half | 53 | 16 | 35 | 6.44 |
| 26 | 19057 | 78056 | 59489 | 76.21 | Organic Large Extra Fancy Fuji Apple | 24 | 4 | 27 | 7.51 |
| 5 | 47766 | 184224 | 140270 | 76.14 | Organic Avocado | 24 | 4 | 6 | 6.44 |
| 46 | 35951 | 60071 | 45558 | 75.84 | Organic Unsweetened Almond Milk | 91 | 16 | 47 | 6.63 |
| 61 | 24838 | 51738 | 38958 | 75.30 | Unsweetened Almondmilk | 91 | 16 | 62 | 6.12 |
| 56 | 12341 | 52497 | 39138 | 74.55 | Hass Avocados | 32 | 4 | 57 | 5.26 |
| 99 | 21709 | 34211 | 25479 | 74.48 | Sparkling Lemon Water | 115 | 7 | 100 | 6.88 |
Model Building¶
In our quest to predict product popularity, we focused on two powerful machine learning models: the Random Forest Regressor and the XGBoost Regressor. These models were chosen for their ability to handle large, complex datasets and for their effectiveness in regression tasks.
Correlation Matrix:¶
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns
# Load the dataset
df = pd.read_csv('/Users/tannermartz/Documents/GitHub/tmartzDS/Output/ReOrdersOnly/3ProductsNew.csv')
# Impute missing values for numerical columns
num_cols = ['Water', 'Energy', 'Protein', 'Total lipid (fat)', 'Carbohydrate, by difference']
imputer = SimpleImputer(strategy='mean')
df[num_cols] = imputer.fit_transform(df[num_cols])
# Standardize the numerical features (scaling does not change Pearson
# correlations, but keeps these columns consistent with the models below)
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])
# Select only numerical columns for correlation
numerical_cols = df.select_dtypes(include=['float64', 'int64'])
# Calculate the correlation matrix
correlation_matrix = numerical_cols.corr()
# Visualize the correlation matrix
plt.figure(figsize=(10, 7))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title("Correlation Matrix of Top 100 Most Popular Items")
plt.show()
Random Forest Regressor¶
This model works by building multiple decision trees and averaging their results to produce more accurate and stable predictions. We trained the Random Forest model on our feature-engineered dataset, tuning parameters such as the number of trees and the depth of each tree to optimize performance. The model's feature importance scores were particularly insightful, revealing that protein content and, in later runs, a product's average position in the cart were among the strongest predictors of its popularity.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
# Prepare the predictors and target variable
X = df[['Water', 'Energy', 'Protein', 'Total lipid (fat)', 'Carbohydrate, by difference']]
y = df['order_count']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Initialize the Random Forest Regressor
model = RandomForestRegressor(n_estimators=100, random_state=42)
# Fit the model
model.fit(X_train, y_train)
# Make predictions
y_pred = model.predict(X_test)
# Evaluate the model
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'RMSE: {rmse}')
# Feature importance
importances = model.feature_importances_
feature_names = X.columns
feature_importances = pd.DataFrame(importances, index=feature_names, columns=['Importance']).sort_values('Importance', ascending=False)
print(feature_importances)
RMSE: 3929.8129637393286

                             Importance
Protein                        0.590304
Total lipid (fat)              0.123060
Carbohydrate, by difference    0.121033
Water                          0.113302
Energy                         0.052301
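An RMSE near 3,930 is hard to judge in isolation, since order counts in this top-100 table span from tens of thousands to nearly half a million. A quick baseline sketch, assuming the X/y split from the cell above, compares against always predicting the training-set mean:

from sklearn.dummy import DummyRegressor

# Baseline: always predict the mean order_count seen in training
baseline = DummyRegressor(strategy='mean')
baseline.fit(X_train, y_train)
baseline_rmse = np.sqrt(mean_squared_error(y_test, baseline.predict(X_test)))
print(f'Baseline RMSE (mean predictor): {baseline_rmse}')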
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import numpy as np
# Feature engineering (optional)
# df['reorder_ratio'] = df['reorder_count'] / df['order_count']
# Prepare the predictors and target variable
X = df.drop(['order_count', 'product_id', 'product_name'], axis=1)
y = df['order_count']
# One-hot encoding for categorical variables
categorical_features = ['aisle_id', 'department_id']
one_hot_encoder = ColumnTransformer(transformers=[('cat', OneHotEncoder(), categorical_features)], remainder='passthrough')
X = one_hot_encoder.fit_transform(X)
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Initialize the Random Forest Regressor
model = RandomForestRegressor(n_estimators=100, random_state=42)
# Fit the model
model.fit(X_train, y_train)
# Make predictions
y_pred = model.predict(X_test)
# Evaluate the model
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'RMSE: {rmse}')
# Feature importance (adjust according to the number of features after encoding)
feature_names = one_hot_encoder.get_feature_names_out()
feature_importances = pd.DataFrame(model.feature_importances_, index=feature_names, columns=['Importance']).sort_values('Importance', ascending=False)
print(feature_importances)
RMSE: 4778.204453522404

                                           Importance
remainder__cart_order                    3.765343e-01
remainder__Carbohydrate, by difference   1.883694e-01
remainder__Total lipid (fat)             9.194996e-02
cat__aisle_id_24                         9.020352e-02
remainder__Water                         8.880549e-02
...                                               ...
cat__aisle_id_85                         6.799863e-08
cat__aisle_id_39                         0.000000e+00
cat__aisle_id_62                         0.000000e+00
cat__aisle_id_18                         0.000000e+00
cat__aisle_id_74                         0.000000e+00

[111 rows x 1 columns]
Plotting Top Features:¶
import matplotlib.pyplot as plt
# Assuming 'feature_importances' is the DataFrame with your feature importance data
# Save to CSV
feature_importances.to_csv('/Users/tannermartz/Documents/GitHub/tmartzDS/Output/feature_importance.csv', index=True)
# Read the saved CSV file for plotting
feature_importances_csv = pd.read_csv('/Users/tannermartz/Documents/GitHub/tmartzDS/Output/feature_importance.csv')
# Selecting the top 10 most important features
top_10_features = feature_importances_csv.nlargest(10, 'Importance')
# Plotting
plt.figure(figsize=(10, 8))
plt.barh(top_10_features['Unnamed: 0'], top_10_features['Importance'])
plt.xlabel('Importance')
plt.ylabel('Features')
plt.title('Top 10 Feature Importance')
plt.gca().invert_yaxis() # Invert y-axis to have the most important feature on top
plt.show()
Tuned Random Forest with Grid Search:¶
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import numpy as np
# Load the data
df = pd.read_csv('/Users/tannermartz/Documents/GitHub/tmartzDS/Output/ReOrdersOnly/3ProductsNew.csv')
df.drop('product_name', axis=1, inplace=True)
# Fill missing values if necessary
df.fillna(df.mean(), inplace=True)
# Selecting features and target variable
features = ['aisle_id', 'department_id', 'cart_order', 'Water', 'Energy', 'Protein', 'Total lipid (fat)', 'Carbohydrate, by difference']
X = df[features]
y = df['order_count']
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Creating a machine learning pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),            # Feature scaling
    ('regressor', RandomForestRegressor())   # Regression model
])
# Parameters for GridSearchCV
param_grid = {
    'regressor__n_estimators': [100, 200],
    'regressor__max_depth': [10, 20, None],
    'regressor__min_samples_split': [2, 5],
    'regressor__min_samples_leaf': [1, 2]
}
# Grid search with cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)
# Best model
best_model = grid_search.best_estimator_
# Predictions
predictions = best_model.predict(X_test)
# Evaluate the model
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print(f"RMSE: {rmse}")
# Feature importances (optional)
feature_importances = pd.DataFrame(best_model.named_steps['regressor'].feature_importances_,
                                   index=features, columns=['Importance']).sort_values('Importance', ascending=False)
print(feature_importances)
RMSE: 3843.393523835483

                             Importance
Protein                        0.308573
cart_order                     0.232967
Carbohydrate, by difference    0.133038
aisle_id                       0.093537
Total lipid (fat)              0.088769
Water                          0.077117
department_id                  0.049956
Energy                         0.016044
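It is also worth inspecting which hyperparameter combination the grid search selected. A short follow-up, using grid_search from the cell above (with neg_mean_squared_error scoring, negating the best score and taking the square root recovers the cross-validated RMSE):

# Inspect the winning hyperparameters and the cross-validated score
print(grid_search.best_params_)
print(f'Best CV RMSE: {np.sqrt(-grid_search.best_score_)}')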
XGBoost Regressor:¶
XGBoost stands out for its speed and performance, particularly on large datasets. We utilized its gradient boosting framework to handle the diverse, high-dimensional Instacart data. The XGBoost model was tuned for parameters such as the learning rate and maximum tree depth. Its predictive performance was evaluated using metrics like RMSE (Root Mean Square Error), and it offered valuable insights into the importance of features such as product category and nutritional content.
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
# Load Data
df = pd.read_csv('/Users/tannermartz/Documents/GitHub/tmartzDS/Output/ReOrdersOnly/3ProductsNew.csv')
# Prepare the Data for Training
X = df.drop(['order_count', 'product_id', 'product_name', 'cart_order'], axis=1) # Drop the target variable and unwanted columns
y = df['order_count'] # Target variable
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Convert the datasets into DMatrix objects
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
# Define parameters for the XGBoost model
params = {
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "eta": 0.1,
    "max_depth": 6,
    # Add more parameters here based on your requirements and tuning
}
# Train the XGBoost model
model = xgb.train(params, dtrain, num_boost_round=100)
# Prediction
y_pred = model.predict(dtest)
# Saving predictions to a CSV file
pred_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
pred_df.to_csv('/Users/tannermartz/Documents/GitHub/tmartzDS/Output/ReOrdersOnly/predictions.csv', index=False)
# Print feature importance
importance = model.get_score(importance_type='weight')
sorted_importance = sorted(importance.items(), key=lambda x: x[1], reverse=True)
for feature, score in sorted_importance:
    print(f'Feature: {feature}, Score: {score}')
Feature: aisle_id, Score: 1079.0
Feature: Water, Score: 553.0
Feature: Protein, Score: 425.0
Feature: Total lipid (fat), Score: 321.0
Feature: department_id, Score: 295.0
Feature: Carbohydrate, by difference, Score: 229.0
Feature: Energy, Score: 227.0
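The parameter comment in the training cell invites further tuning. As a sketch, xgboost's built-in cross-validation can choose the number of boosting rounds with early stopping (this uses dtrain and params from the cell above; the specific round counts and fold settings here are illustrative, not the original notebook's choices):

# Cross-validate to choose the boosting-round count with early stopping
cv_results = xgb.cv(
    params, dtrain,
    num_boost_round=500,
    nfold=5,
    early_stopping_rounds=20,
    metrics='rmse',
    seed=42
)
print(f"Best round: {len(cv_results)}, CV RMSE: {cv_results['test-rmse-mean'].iloc[-1]}")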
Plotting the XGBoost Feature Importance¶
# Plotting
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
plt.title("Feature Importances")
plt.bar(range(len(sorted_importance)), [val[1] for val in sorted_importance], align='center')
plt.xticks(range(len(sorted_importance)), [val[0] for val in sorted_importance], rotation=90)
plt.xlabel('Features')
plt.ylabel('Importance Score')
plt.tight_layout()
plt.savefig('/Users/tannermartz/Documents/GitHub/tmartzDS/Output/ReOrdersOnly/feature_importances.png')
plt.show()
Insights from Model Results¶
Both models pointed to the same broad drivers of popularity. Nutritional factors such as protein and water content ranked among the strongest predictors, products in produce aisles stood out, and our EDA showed that the most popular products are also the most heavily reordered. These patterns align with a growing consumer trend toward healthier food choices.
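As a quick sanity check on the reorder claim, the reorder percentage can be correlated with order volume directly. A sketch, assuming the product-level CSV used in the EDA above (which contains both perc_reorder and order_count):

import pandas as pd

# How strongly does reorder percentage track order volume among top products?
df = pd.read_csv('/Users/tannermartz/Documents/GitHub/tmartzDS/Output/ProductsNew.csv')
print(df['perc_reorder'].corr(df['order_count']))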
Metrics for Evaluation:¶
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
# Make predictions
y_pred = model.predict(dtest)
# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Root Mean Squared Error (RMSE): {rmse}")
# Calculate MAE
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error (MAE): {mae}")
# Calculate R-squared
r2 = r2_score(y_test, y_pred)
print(f"R-squared: {r2}")
Root Mean Squared Error (RMSE): 4362.271885990202
Mean Absolute Error (MAE): 1857.8579180908202
R-squared: -2.7677509363329786

Note that the R-squared is negative: on this held-out split, the XGBoost model predicts order counts worse than simply guessing the mean. With only the top 100 products in the dataset, these features describe what is popular far better than they predict exact order volumes.
Conclusions:¶
import pandas as pd
# Load the CSV files
file1 = '/Users/tannermartz/Documents/GitHub/tmartzDS/Output/ReOrdersOnly/3ProductsNew.csv' # Reorder Popularity Rank File
file2 = '/Users/tannermartz/Documents/GitHub/tmartzDS/Output/ReOrdersOnly/top_100_products_with_reorder_counts.csv' # First Time Order Popularity Rank File
# Read the first 100 rows from each file
df1 = pd.read_csv(file1).head(100)
df2 = pd.read_csv(file2).head(100)
# Add a rank column based on the existing order in each file
df1['rank_file1'] = range(1, len(df1) + 1)
df2['rank_file2'] = range(1, len(df2) + 1)
# Merge the two dataframes on the item identifier
merged_df = df1.merge(df2, on='product_name', how='inner')
# Calculate the difference in rank (positive means the product ranks
# worse in reorders than in first-time orders)
merged_df['rank_difference'] = merged_df['rank_file1'] - merged_df['rank_file2']
# Sort by signed rank difference and select the 10 largest (most positive) differences
largest_diff = merged_df.sort_values(by='rank_difference', ascending=False).head(10)
# Sort by signed rank difference and select the 10 smallest (most negative) differences
smallest_diff = merged_df.sort_values(by='rank_difference', ascending=True).head(10)
# Display results
print("(Pursuaded into buying) Top 10 Products with Largest Rank Difference:")
print(largest_diff[['product_name', 'rank_file1', 'rank_file2', 'rank_difference']])
print("\n(Desuaded from buying) Top 10 Products with Smallest Rank Difference:")
print(smallest_diff[['product_name', 'rank_file1', 'rank_file2', 'rank_difference']])
# Optionally, save the results to new CSV files
#largest_diff.to_csv('path_to_save_largest_rank_differences.csv', index=False)
#smallest_diff.to_csv('path_to_save_smallest_rank_differences.csv', index=False)
merged_df.to_csv('/Users/tannermartz/Documents/GitHub/tmartzDS/Output/reorder_difference.csv', index=False)
(Persuaded into buying) Top 10 Products with Largest Rank Difference:
                         product_name  rank_file1  rank_file2  rank_difference
83   Organic Lacinato (Dinosaur) Kale          99          69               30
76                Organic Ginger Root          84          56               28
62                  Organic Red Onion          64          37               27
74                  Green Bell Pepper          79          52               27
73                        Red Peppers          78          51               27
77                 Small Hass Avocado          86          60               26
65      Organic Italian Parsley Bunch          68          43               25
57                   Organic Cilantro          59          34               25
75  Boneless Skinless Chicken Breasts          80          58               22
36             Organic Grape Tomatoes          38          19               19

(Dissuaded from buying) Top 10 Products with Smallest Rank Difference:
                   product_name  rank_file1  rank_file2  rank_difference
16                         Soda          17          90              -73
53     Organic Reduced Fat Milk          55          93              -38
55                   Whole Milk          57          92              -35
22                Hass Avocados          23          57              -34
15                 Spring Water          16          49              -33
51       100% Raw Coconut Water          53          83              -30
56          Granny Smith Apples          58          88              -30
67        Sparkling Lemon Water          71         100              -29
37  Organic Reduced Fat 2% Milk          39          64              -25
58        Organic Bartlett Pear          60          84              -24
Insights:¶
Top 10 Products with Largest Rank Difference (Persuaded into Buying): These products, like 'Organic Ginger Root', rank considerably higher in first-time orders than in reorders. This suggests they may be more visible or promoted on the site, driving first-time purchases that do not necessarily translate into frequent reorders.
Top 10 Products with Smallest Rank Difference (Dissuaded from Buying): Products like 'Soda' and 'Organic Reduced Fat Milk' rank poorly among first-time orders yet are reordered heavily. This indicates that these items, while possibly less visible or promoted, have a consistent customer base who regularly repurchase them after the initial order.
This analysis provides insight into different purchasing behaviors: products that attract first-time buyers (possibly due to visibility or promotions) versus those that maintain a loyal customer base leading to frequent reorders.
Conclusions and Future Directions¶
Our exploration of the Instacart dataset has led us to intriguing insights about what drives product popularity in online grocery shopping. Our key discovery is the significant influence of reorder rates, product types (notably fruits and vegetables), and nutritional content on a product's popularity. This suggests a consumer preference towards fresh, healthy options and essentials that require regular replenishment.
Interestingly, our machine learning models revealed that features like cart order and protein content are pivotal in predicting a product's popularity, and the rank comparison highlighted the pull of reorder habits, indicating the importance of both past purchasing patterns and product placement within the shopping interface.
Looking ahead, we see immense potential in further exploring this domain. We plan to delve deeper into feature engineering, integrating more diverse datasets like pricing and product placement information. Additionally, experimenting with different machine learning algorithms and advanced visualization techniques will enhance our understanding and provide more nuanced insights.
This study not only aids retailers in optimizing their product strategies but also opens up new avenues for understanding consumer preferences in the evolving landscape of online retail.