The full dataset can be found on Kaggle here: https://www.kaggle.com/datasets/irfanasrullah/groceries?select=groceries+-+groceries.csv
It is fairly basic and no heavy preprocessing is needed in advance; I chose to drop the '# Item(s)' column in place below, but you could easily do so in Excel or another spreadsheet tool beforehand.
Our first step is to transform the data in preparation for one-hot encoding, which essentially turns it into a grid of "True"/"False" (or "1"/"0") values representing the products. This is done at the level of transactions, as is the intent of the dataset overall.
Having data in this format allows us to explore it at length and create various visualizations that help identify rules between pairs of grocery store items, assess them against various metrics, and figure out which rules reflect the strongest associations.
For example, this would let you confirm with data whether milk and eggs are actually closely associated, or whether that is simply an assumption people make without evidence behind it.
In this example, we're not looking to identify or answer questions about specific pairings, although we could if we wanted to. Instead, we seek to trim the number of rules in order to understand which products are closely related, and therefore how they could be marketed together or placed nearer one another in-store.
I won't go into great detail about each metric here, as that information is widely available online.
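That said, a quick refresher never hurts. The three core metrics can be computed by hand; here is a minimal sketch on a toy basket list (invented for illustration, not the Kaggle data):

```python
# Toy transactions to illustrate the metrics; not the Kaggle dataset.
transactions = [
    {"milk", "eggs", "bread"},
    {"milk", "bread"},
    {"eggs", "butter"},
    {"milk", "eggs"},
]

n = len(transactions)

def support(*items):
    """Fraction of transactions containing all of the given items."""
    return sum(set(items) <= t for t in transactions) / n

def confidence(antecedent, consequent):
    """Of the transactions containing the antecedent, the share also containing the consequent."""
    return support(antecedent, consequent) / support(antecedent)

def lift(antecedent, consequent):
    """Observed co-occurrence relative to what independence would predict."""
    return support(antecedent, consequent) / (support(antecedent) * support(consequent))

print(support("milk"))             # 0.75 (3 of 4 baskets)
print(confidence("milk", "eggs"))  # 0.5 / 0.75
print(lift("milk", "eggs"))        # 0.5 / (0.75 * 0.75)
```

The mlxtend functions used below compute these same quantities, just efficiently over the full itemset lattice.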
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import os
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
from pandas.plotting import parallel_coordinates
Checking and setting the working directory to avoid file-path issues.
print(os.getcwd())
path = r"F:\[Personal]\Data Analytics Portfolio\Market Basket Analysis\mba"
os.chdir(path)
print(os.getcwd())
F:\[Personal]\Data Analytics Portfolio\Market Basket Analysis\mba
F:\[Personal]\Data Analytics Portfolio\Market Basket Analysis\mba
df = pd.read_csv(r'F:\[Personal]\Data Analytics Portfolio\Market Basket Analysis\mba\groceries - groceries.csv')
# Our data has a column that we do not need for analysis so we will be dropping it altogether
df.drop(columns = df.columns[0], inplace = True)
# Transforming the dataframe into a list of lists representing transactions
transactions = df.values.astype(str).tolist()
# Delete the nans in the list of lists
transactions = [[item for item in row if item != 'nan'] for row in transactions]
# Print first 5 lists to check our work
print(transactions[:5])
[['citrus fruit', 'semi-finished bread', 'margarine', 'ready soups'], ['tropical fruit', 'yogurt', 'coffee'], ['whole milk'], ['pip fruit', 'yogurt', 'cream cheese', 'meat spreads'], ['other vegetables', 'whole milk', 'condensed milk', 'long life bakery product']]
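Before handing these lists to mlxtend, it may help to see what the one-hot grid actually looks like. The same encoding can be reproduced with plain pandas; a sketch using a few of the transactions printed above:

```python
import pandas as pd

# A few of the transactions printed above, for illustration only.
transactions = [
    ['citrus fruit', 'semi-finished bread', 'margarine', 'ready soups'],
    ['tropical fruit', 'yogurt', 'coffee'],
    ['whole milk'],
]

# The union of all items, sorted to mimic TransactionEncoder's column ordering.
items = sorted({item for row in transactions for item in row})

# Each cell is True when that transaction contains that item.
onehot_manual = pd.DataFrame(
    [[item in set(row) for item in items] for row in transactions],
    columns=items,
)
print(onehot_manual)
```

TransactionEncoder (used next) does exactly this, but scales to the full 9,835 transactions and 169 items.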
# First, we instantiate the TransactionEncoder, which will enable us to one-hot encode the data
encoder = TransactionEncoder().fit(transactions)
# Perform one-hot encoding (could be chained above, but called out separately for readability)
onehot = encoder.transform(transactions)
# Convert onehot to a DataFrame
onehot = pd.DataFrame(onehot, columns = encoder.columns_)
# Check output
print(onehot)
Instant food products UHT-milk abrasive cleaner artif. sweetener \
0 False False False False
1 False False False False
2 False False False False
3 False False False False
4 False False False False
... ... ... ... ...
9830 False False False False
9831 False False False False
9832 False False False False
9833 False False False False
9834 False False False False
baby cosmetics baby food bags baking powder bathroom cleaner \
0 False False False False False
1 False False False False False
2 False False False False False
3 False False False False False
4 False False False False False
... ... ... ... ... ...
9830 False False False False False
9831 False False False False False
9832 False False False False False
9833 False False False False False
9834 False False False False False
beef ... turkey vinegar waffles whipped/sour cream whisky \
0 False ... False False False False False
1 False ... False False False False False
2 False ... False False False False False
3 False ... False False False False False
4 False ... False False False False False
... ... ... ... ... ... ... ...
9830 True ... False False False True False
9831 False ... False False False False False
9832 False ... False False False False False
9833 False ... False False False False False
9834 False ... False True False False False
white bread white wine whole milk yogurt zwieback
0 False False False False False
1 False False False True False
2 False False True False False
3 False False False True False
4 False False True False False
... ... ... ... ... ...
9830 False False True False False
9831 False False False False False
9832 False False False True False
9833 False False False False False
9834 False False False False False
[9835 rows x 169 columns]
# Computing Support for Single Items
print(onehot.mean())
Instant food products 0.008033
UHT-milk 0.033452
abrasive cleaner 0.003559
artif. sweetener 0.003254
baby cosmetics 0.000610
...
white bread 0.042095
white wine 0.019014
whole milk 0.255516
yogurt 0.139502
zwieback 0.006914
Length: 169, dtype: float64
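Single-item support is just the column mean because each column is a 0/1 indicator. Pairwise support follows the same logic and can be cross-checked with a matrix product; a sketch on a tiny hand-made one-hot frame (invented values, not the groceries table):

```python
import pandas as pd

# Tiny hand-made one-hot frame for illustration.
onehot_toy = pd.DataFrame({
    'milk':  [True, True, False, True],
    'eggs':  [True, False, True, True],
    'bread': [True, True, False, False],
})

# X^T X / n gives the co-occurrence frequency for every item pair;
# the diagonal recovers single-item support (the column means).
ints = onehot_toy.astype(int)
pair_support = ints.T.dot(ints) / len(ints)
print(pair_support)
```

This is only a sanity check; apriori below does the equivalent counting while pruning itemsets that cannot meet the support threshold.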
frequent_itemsets = apriori(onehot, min_support = 0.001, use_colnames = True)
print(len(frequent_itemsets))
13492
rules = association_rules(frequent_itemsets, metric = 'support', min_threshold = 0.001)
print(rules[:5])
               antecedents              consequents  antecedent support  \
0          (bottled water)  (Instant food products)            0.110524
1  (Instant food products)          (bottled water)            0.008033
2  (Instant food products)                 (butter)            0.008033
3                 (butter)  (Instant food products)            0.055414
4           (citrus fruit)  (Instant food products)            0.082766

   consequent support   support  confidence      lift  leverage  conviction
0            0.008033  0.001017    0.009200  1.145296  0.000129    1.001178
1            0.110524  0.001017    0.126582  1.145296  0.000129    1.018386
2            0.055414  0.001220    0.151899  2.741145  0.000775    1.113765
3            0.008033  0.001220    0.022018  2.741145  0.000775    1.014301
4            0.008033  0.001118    0.013514  1.682347  0.000454    1.005556
print(frequent_itemsets.nlargest(n = 15, columns = 'support'))
      support            itemsets
154  0.255516        (whole milk)
96   0.193493  (other vegetables)
115  0.183935        (rolls/buns)
130  0.174377              (soda)
155  0.139502            (yogurt)
10   0.110524     (bottled water)
116  0.108998   (root vegetables)
147  0.104931    (tropical fruit)
125  0.098526     (shopping bags)
122  0.093950           (sausage)
99   0.088968            (pastry)
28   0.082766      (citrus fruit)
9    0.080529      (bottled beer)
89   0.079817        (newspapers)
18   0.077682       (canned beer)
plt.figure(figsize=(12,6))
plt.xticks(rotation = 90)
sns.barplot(x = 'itemsets', y = 'support', data = frequent_itemsets.nlargest(n = 15, columns = 'support'))
<Axes: xlabel='itemsets', ylabel='support'>
Whole milk has the highest Support by a wide margin, followed by other vegetables and rolls/buns, with beverages and other staples rounding out this set of 15.
print(rules.columns)
Index(['antecedents', 'consequents', 'antecedent support',
'consequent support', 'support', 'confidence', 'lift', 'leverage',
'conviction'],
dtype='object')
print(rules[['antecedents','consequents']])
antecedents \
0 (bottled water)
1 (Instant food products)
2 (Instant food products)
3 (butter)
4 (citrus fruit)
... ...
102761 (whipped/sour cream)
102762 (tropical fruit)
102763 (root vegetables)
102764 (yogurt)
102765 (whole milk)
consequents
0 (Instant food products)
1 (bottled water)
2 (butter)
3 (Instant food products)
4 (Instant food products)
... ...
102761 (other vegetables, tropical fruit, root vegeta...
102762 (other vegetables, whipped/sour cream, root ve...
102763 (other vegetables, whipped/sour cream, tropica...
102764 (other vegetables, whipped/sour cream, tropica...
102765 (other vegetables, whipped/sour cream, tropica...
[102766 rows x 2 columns]
Making a new frequent_itemsets and a new rules DataFrame allows me to tinker with the thresholds to generate a heatmap that isn't overwhelming.
Finding the right support levels to keep the data comprehensible is more art than science, and is better informed in a business setting where domain knowledge guides the numerical choices.
Here, the focus is on a readable visualization and on demonstrating a framework that can be extended to any such analysis.
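Once rules are in a DataFrame, this kind of pruning is ordinary pandas boolean filtering. A sketch on a hypothetical rules table (column names match mlxtend's output; the numbers and thresholds are invented for illustration):

```python
import pandas as pd

# Hypothetical rules table with mlxtend-style columns; values are invented.
rules_toy = pd.DataFrame({
    'antecedents': ['(butter)', '(yogurt)', '(soda)', '(curd)'],
    'consequents': ['(whole milk)', '(whole milk)', '(rolls/buns)', '(yogurt)'],
    'support':     [0.027, 0.056, 0.038, 0.017],
    'confidence':  [0.497, 0.402, 0.220, 0.327],
    'lift':        [1.95, 1.57, 1.19, 2.34],
})

# Keep only rules clearing joint thresholds; tune these to the business context.
pruned = rules_toy[(rules_toy['support'] >= 0.02) &
                   (rules_toy['confidence'] >= 0.4) &
                   (rules_toy['lift'] > 1.5)]
print(pruned)
```

Combining metrics like this is usually safer than filtering on any single one, since high lift on a vanishingly rare pair is rarely actionable.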
frequent_itemsets2 = apriori(onehot, min_support = 0.0001, use_colnames = True, max_len = 2)
# Default metric is 'confidence' with min_threshold = 0.8
rules2 = association_rules(frequent_itemsets2)
# Convert antecedents and consequents into strings
rules2['antecedents'] = rules2['antecedents'].apply(lambda a: ','.join(list(a)))
rules2['consequents'] = rules2['consequents'].apply(lambda a: ','.join(list(a)))
# Print to check
print(rules2[['antecedents','consequents']])
              antecedents             consequents
0               baby food             brown bread
1               baby food                  butter
2               baby food             butter milk
3               baby food                cake bar
4               baby food               chocolate
5               baby food                    curd
6               baby food                 dessert
7               baby food       finished products
8               baby food       frozen vegetables
9               baby food   fruit/vegetable juice
10              baby food        other vegetables
11              baby food                   pasta
12              baby food                  pastry
13              baby food               pip fruit
14              baby food              rolls/buns
15              baby food             salty snack
16              baby food                 sausage
17              baby food           shopping bags
18              baby food             soft cheese
19              baby food                   soups
20              baby food          tropical fruit
21              baby food                 waffles
22              baby food      whipped/sour cream
23              baby food             white bread
24              baby food                  yogurt
25   sound storage medium           bottled water
26   sound storage medium                   candy
27   sound storage medium                cat food
28  preservation products            citrus fruit
29   sound storage medium  frozen potato products
30   sound storage medium                     ham
31   sound storage medium                  pastry
32  preservation products      whipped/sour cream
33   sound storage medium           shopping bags
34   sound storage medium                    soda
35   sound storage medium             white bread
# Transform the antecedent, consequent, and support columns into a matrix
support_table = rules2.pivot(index = 'consequents', columns = 'antecedents', values = 'support')
# Generate heatmap of Support
fig, ax = plt.subplots(figsize=(3, 5))
sns.heatmap(support_table, cmap = 'rocket_r', cbar_kws={'label': 'Support'})
plt.show()
From this heatmap, we immediately see high Support for pairs of preservation products with bread products and citrus fruit. We'll see similar results for other rules below, indicating the strength of non-perishable items and their antecedent -> consequent pairings.
Scatterplots in this type of analysis allow us to identify natural thresholds in our data, since we can see an entire dataset's worth of rules rather than just a few at a time.
The goal should be to use the findings to help us prune rules in order to identify the best ones.
We don't see much individual information here, but playing with the plot and threshold values allows us to prune in a visual fashion.
sns.scatterplot(x = 'antecedent support', y = 'consequent support', size = 'lift', data = rules2)
plt.show()
# To make this plot, we must convert our rules to coordinates
# To keep things clean and separate, let's use a 3 to denote our code
frequent_itemsets3 = apriori(onehot, min_support = 0.0001, use_colnames = True, max_len = 2)
rules3 = association_rules(frequent_itemsets3)
# Extract the first item in each set and store it as a scalar value. This allows us to use coords properly next.
rules3['antecedent'] = rules3['antecedents'].apply(lambda antecedent: list(antecedent)[0])
rules3['consequent'] = rules3['consequents'].apply(lambda consequent: list(consequent)[0])
rules3['rule'] = rules3.index
# Define coordinates and label
coords = rules3[['antecedent','consequent','rule']]
# Generate the parallel coordinates plot; the colorbar indexes the rules
import matplotlib as mpl
fig, ax = plt.subplots(figsize=(3, 10))
cmap = plt.get_cmap('rainbow')
rev_map = cmap.reversed()
bounds = np.arange(rules3['rule'].min(), rules3['rule'].max() + 1)
norm = mpl.colors.BoundaryNorm(bounds, 256)
parallel_coordinates(coords, 'rule', colormap = rev_map, ax = ax)
ax.legend_.remove()
plt.colorbar(mpl.cm.ScalarMappable(norm=norm, cmap=rev_map), ax=ax, orientation = 'horizontal', label='Rule index')
plt.show()
Here we can see strong support for preservation products paired with other such products, and for baby food as an antecedent with varying degrees of support across a wide variety of food items.
Neither of those findings is particularly surprising, but this does offer a look at specifics when considering which products to group together in-store and in advertising campaigns.
# Generate the same plot for the rules filtered by Lift
frequent_itemsets4 = apriori(onehot, min_support = 0.0001, use_colnames = True, max_len = 2)
rules4 = association_rules(frequent_itemsets4, metric = 'lift', min_threshold = 100)
# Extract the first item in each set and store it as a scalar value. This allows us to use coords properly next.
rules4['antecedent'] = rules4['antecedents'].apply(lambda antecedent: list(antecedent)[0])
rules4['consequent'] = rules4['consequents'].apply(lambda consequent: list(consequent)[0])
rules4['rule'] = rules4.index
# Define coordinates and label
coords = rules4[['antecedent','consequent','rule']]
import matplotlib as mpl
fig, ax = plt.subplots(figsize=(3, 10))
cmap = plt.get_cmap('rainbow')
rev_map = cmap.reversed()
bounds = np.arange(rules4['rule'].min(), rules4['rule'].max() + 1)
norm = mpl.colors.BoundaryNorm(bounds, 256)
parallel_coordinates(coords, 'rule', colormap = rev_map, ax = ax)
ax.legend_.remove()
plt.colorbar(mpl.cm.ScalarMappable(norm=norm, cmap=rev_map), ax=ax, orientation = 'horizontal', label='Rule index')
plt.show()
Recall that Lift is the Support of X and Y together divided by the product of the Support of X and the Support of Y.
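As a quick sanity check, that formula can be evaluated directly from the single-item supports printed earlier; the joint support used here is an assumed value for illustration only:

```python
# Lift = support(X and Y) / (support(X) * support(Y)).
support_milk = 0.255516    # whole milk, from the single-item support output above
support_yogurt = 0.139502  # yogurt, from the same output
joint_support = 0.056      # assumed joint support of (whole milk, yogurt), for illustration

lift = joint_support / (support_milk * support_yogurt)
print(round(lift, 3))  # > 1 means the pair co-occurs more often than chance would predict
```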
We see similar rules by strength of their Lift here as well with preservation products and other preservable goods or non-perishables being linked together.
Generating rules and then pruning them is what you make of it; the support thresholds you choose and narrow in on will dictate which pairings show up prominently in the analysis.
This is more of an art than a science because, even after narrowing down to rules with higher metrics like Support and Lift, it is ultimately on the analyst and leadership to determine where commonalities thrive in a broader sense.
In other words, we identified clear linkages between non-perishable food items, with the analysis indicating which specific items fit together most closely.
Through the different visualizations and exploration, we found that preservation products, baby food, and sound storage medium make excellent antecedents to a variety of products, as seen in the parallel coordinates plots.
In terms of pairings on a more granular level, bread products and citrus fruit were identified as having strong support, as we saw in the heatmap.
Earlier on, we looked at Support across all frequent items and found that whole milk, vegetables, bread products, and non-perishable items largely filled the top 15.
The scatterplot was a useful visualization to help pare down rules over time, although I had already been experimenting with different thresholds. Still, it was a good verification that I was able to eliminate much of the "noise" that would have been present before such pruning.
All of these findings inform us about the links between these product types and the specific products, which enables us to suggest the aforementioned pairings be advertised and placed closely together in a brick-and-mortar store.