To begin, we import the packages we need for analysis, cleaning, and visualization.
import pandas as pd
import seaborn as sns
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import os
This bit of code prints the current working directory and then sets it to the one we want. This is handy when importing the code into other tools that may not resolve paths intuitively.
print(os.getcwd())
path = r"F:\[Personal]\Data Analytics Portfolio\Cohort Analysis\online_retail"
os.chdir(path)
print(os.getcwd())
F:\[Personal]\Data Analytics Portfolio\Cohort Analysis\online_retail
F:\[Personal]\Data Analytics Portfolio\Cohort Analysis\online_retail
The dataset comes as two CSVs (or one Excel workbook with two sheets). I chose to append them because it's simple, but that method is deprecated, so pd.concat is the recommended replacement going forward.
We're also inspecting the data and, most importantly, the datatypes. For this analysis, it's critical that the 'InvoiceDate' column be correctly parsed as datetime.
Below are two methods we use to make sure the datetimes import correctly into any tool. The first is the set of optional arguments passed to the pd.read_csv calls; more on the second below.
df1 = pd.read_csv(r'F:\[Personal]\Data Analytics Portfolio\Cohort Analysis\online_retail\online2009_2010.csv',
encoding = 'unicode_escape', parse_dates=['InvoiceDate'], infer_datetime_format=True)
df2 = pd.read_csv(r'F:\[Personal]\Data Analytics Portfolio\Cohort Analysis\online_retail\online2010_2011.csv',
encoding = 'unicode_escape', parse_dates=['InvoiceDate'], infer_datetime_format=True)
df = df1.append(df2)
print(df.head())
print(df.shape)
print(df.dtypes)
df.isna().sum()
C:\Users\Tyler\AppData\Local\Temp\ipykernel_32664\260097437.py:5: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  df = df1.append(df2)
Invoice StockCode Description Quantity \
0 489434 85048 15CM CHRISTMAS GLASS BALL 20 LIGHTS 12
1 489434 79323P PINK CHERRY LIGHTS 12
2 489434 79323W WHITE CHERRY LIGHTS 12
3 489434 22041 RECORD FRAME 7" SINGLE SIZE 48
4 489434 21232 STRAWBERRY CERAMIC TRINKET BOX 24
InvoiceDate Price Customer ID Country
0 2009-12-01 07:45:00 6.95 13085.0 United Kingdom
1 2009-12-01 07:45:00 6.75 13085.0 United Kingdom
2 2009-12-01 07:45:00 6.75 13085.0 United Kingdom
3 2009-12-01 07:45:00 2.10 13085.0 United Kingdom
4 2009-12-01 07:45:00 1.25 13085.0 United Kingdom
(1067371, 8)
Invoice object
StockCode object
Description object
Quantity int64
InvoiceDate datetime64[ns]
Price float64
Customer ID float64
Country object
dtype: object
Invoice             0
StockCode           0
Description      4382
Quantity            0
InvoiceDate         0
Price               0
Customer ID    243007
Country             0
dtype: int64
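As the FutureWarning above suggests, pd.concat is the drop-in replacement for the deprecated append. A minimal sketch, using hypothetical mini-frames standing in for df1 and df2:

```python
import pandas as pd

# Hypothetical mini-frames standing in for df1/df2 loaded above.
df1 = pd.DataFrame({'Invoice': ['489434'], 'Quantity': [12]})
df2 = pd.DataFrame({'Invoice': ['536365'], 'Quantity': [6]})

# pd.concat replaces the deprecated DataFrame.append;
# ignore_index=True rebuilds a clean 0..n-1 row index.
df = pd.concat([df1, df2], ignore_index=True)
print(df.shape)  # (2, 2)
```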
This section of code does two things: it ensures the conversion to datetime and cleans the data somewhat.
We instruct pd.to_datetime to coerce errors, which turns unparseable values into NaT rather than raising, and we set utc to "True" because datetimes with mixed offsets (time zones) can create problems in conversion.
This data cleaning isn't strictly necessary for establishing cohorts, but I prefer to do it for cleanliness and to ensure nothing I run later is affected by the negative values observed in both the "Quantity" and "Price" fields.
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], errors='coerce', utc = True)
df['Price'] = df['Price'].abs()
print(df['Price'].sort_values(ascending = True))
df['Quantity'] = df['Quantity'].abs()
print(df['Quantity'].sort_values(ascending = True))
170484 0.00
374514 0.00
374513 0.00
424304 0.00
374512 0.00
...
241827 25111.09
403472 38925.87
222681 38970.00
276274 44031.79
179403 53594.36
Name: Price, Length: 1067371, dtype: float64
541909 1
251307 1
124382 1
124383 1
124385 1
...
90857 19152
61624 74215
61619 74215
540422 80995
540421 80995
Name: Quantity, Length: 1067371, dtype: int64
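To see what errors='coerce' actually does, here is a toy example (hypothetical values, not from the dataset) with one malformed timestamp:

```python
import pandas as pd

# Toy series with one deliberately malformed timestamp.
raw = pd.Series(['2009-12-01 07:45:00', 'not a date'])

# errors='coerce' turns unparseable entries into NaT instead of raising;
# utc=True normalizes everything to a single UTC offset.
parsed = pd.to_datetime(raw, errors='coerce', utc=True)
print(parsed.isna().sum())  # 1 -> the bad row became NaT
```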
Here, we establish a function, used below, that truncates each 'InvoiceDate' to the first day of its month. This is how we define cohort and invoice months.
def get_month(x):
    return dt.datetime(x.year, x.month, 1)
df['InvoiceMonth'] = df['InvoiceDate'].apply(get_month)
grouping = df.groupby('Customer ID')['InvoiceMonth']
df['CohortMonth'] = grouping.transform('min')
print(df.head())
Invoice StockCode Description Quantity \
0 489434 85048 15CM CHRISTMAS GLASS BALL 20 LIGHTS 12
1 489434 79323P PINK CHERRY LIGHTS 12
2 489434 79323W WHITE CHERRY LIGHTS 12
3 489434 22041 RECORD FRAME 7" SINGLE SIZE 48
4 489434 21232 STRAWBERRY CERAMIC TRINKET BOX 24
InvoiceDate Price Customer ID Country InvoiceMonth \
0 2009-12-01 07:45:00+00:00 6.95 13085.0 United Kingdom 2009-12-01
1 2009-12-01 07:45:00+00:00 6.75 13085.0 United Kingdom 2009-12-01
2 2009-12-01 07:45:00+00:00 6.75 13085.0 United Kingdom 2009-12-01
3 2009-12-01 07:45:00+00:00 2.10 13085.0 United Kingdom 2009-12-01
4 2009-12-01 07:45:00+00:00 1.25 13085.0 United Kingdom 2009-12-01
CohortMonth
0 2009-12-01
1 2009-12-01
2 2009-12-01
3 2009-12-01
4 2009-12-01
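The apply(get_month) call works row by row; an equivalent vectorized alternative (shown here on toy, timezone-naive timestamps) is to round-trip through a monthly period:

```python
import pandas as pd

# Toy timestamps (hypothetical); apply(get_month) above does the same truncation.
dates = pd.Series(pd.to_datetime(['2009-12-01 07:45:00',
                                  '2010-01-15 12:30:00']))

# Vectorized month truncation: to_period('M') then back to a timestamp
# yields the first day of each month without a Python-level apply.
months = dates.dt.to_period('M').dt.to_timestamp()
print(months.tolist())
```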
Here, we establish another function that returns the year and month of any datetime column we pass into it.
def get_date_int(df_column):
year = df_column.dt.year
month = df_column.dt.month
return year, month
We immediately put the function into practice, having it return cohort and invoice years and months.
The next step is to compute the difference between the invoice dates and the cohort dates, which gives us a 'CohortIndex' to go with the newly established 'InvoiceMonth' and 'CohortMonth' fields.
invoice_year, invoice_month = get_date_int(df['InvoiceMonth'])
cohort_year, cohort_month = get_date_int(df['CohortMonth'])
years_diff = invoice_year - cohort_year
months_diff = invoice_month - cohort_month
df['CohortIndex'] = years_diff * 12 + months_diff + 1
df.head()
| Invoice | StockCode | Description | Quantity | InvoiceDate | Price | Customer ID | Country | InvoiceMonth | CohortMonth | CohortIndex | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 489434 | 85048 | 15CM CHRISTMAS GLASS BALL 20 LIGHTS | 12 | 2009-12-01 07:45:00+00:00 | 6.95 | 13085.0 | United Kingdom | 2009-12-01 | 2009-12-01 | 1.0 |
| 1 | 489434 | 79323P | PINK CHERRY LIGHTS | 12 | 2009-12-01 07:45:00+00:00 | 6.75 | 13085.0 | United Kingdom | 2009-12-01 | 2009-12-01 | 1.0 |
| 2 | 489434 | 79323W | WHITE CHERRY LIGHTS | 12 | 2009-12-01 07:45:00+00:00 | 6.75 | 13085.0 | United Kingdom | 2009-12-01 | 2009-12-01 | 1.0 |
| 3 | 489434 | 22041 | RECORD FRAME 7" SINGLE SIZE | 48 | 2009-12-01 07:45:00+00:00 | 2.10 | 13085.0 | United Kingdom | 2009-12-01 | 2009-12-01 | 1.0 |
| 4 | 489434 | 21232 | STRAWBERRY CERAMIC TRINKET BOX | 24 | 2009-12-01 07:45:00+00:00 | 1.25 | 13085.0 | United Kingdom | 2009-12-01 | 2009-12-01 | 1.0 |
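To make the CohortIndex formula concrete, here is the arithmetic worked out for a hypothetical customer whose first purchase (cohort) was 2009-12 and who buys again in 2010-03:

```python
# Worked example of the CohortIndex formula used above.
invoice_year, invoice_month = 2010, 3
cohort_year, cohort_month = 2009, 12

# (1 year * 12) + (3 - 12) + 1 = 12 - 9 + 1 = 4
cohort_index = (invoice_year - cohort_year) * 12 \
    + (invoice_month - cohort_month) + 1
print(cohort_index)  # 4 -> the fourth month of the customer's lifetime
```

The "+ 1" makes the cohort's own first month index 1 rather than 0.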
This is where things get interesting. We establish a grouping that lets us sort the data by cohort month and index.
We rename the columns for better readability and finally build a pivot table from the cohort fields and the 'Customer ID' field. Think of this as a preliminary cohort in tabular format.
grouping = df.groupby(['CohortMonth', 'CohortIndex'])
cohort_data = grouping['Customer ID'].nunique().reset_index()
cohort_data.columns = ['CohortMonth', 'CohortIndex', 'Count of unique Customer ID']
retention_counts = cohort_data.pivot(index = 'CohortMonth', columns = 'CohortIndex', values = 'Count of unique Customer ID')
print(retention_counts.head())
CohortIndex     1.0    2.0    3.0    4.0    5.0    6.0    7.0    8.0    9.0  \
CohortMonth
2009-12-01   1045.0  392.0  358.0  447.0  410.0  408.0  408.0  374.0  355.0
2010-01-01    394.0   86.0  119.0  120.0  110.0  115.0  105.0   91.0  114.0
2010-02-01    363.0  109.0   82.0  110.0   93.0   76.0   79.0  103.0  100.0
2010-03-01    436.0   95.0  113.0  103.0  100.0   87.0  105.0  130.0  126.0
2010-04-01    291.0   67.0   58.0   47.0   54.0   67.0   79.0   76.0   33.0

CohortIndex   10.0  ...   16.0   17.0   18.0   19.0   20.0   21.0   22.0  \
CohortMonth         ...
2009-12-01   392.0  ...  319.0  273.0  316.0  303.0  287.0  274.0  332.0
2010-01-01   134.0  ...   60.0   86.0   74.0   69.0   73.0   93.0   73.0
2010-02-01   106.0  ...   74.0   67.0   61.0   53.0   85.0   90.0   62.0
2010-03-01    50.0  ...   74.0   76.0   69.0   74.0   89.0   93.0   33.0
2010-04-01    34.0  ...   43.0   41.0   41.0   50.0   61.0   19.0    NaN

CohortIndex   23.0   24.0   25.0
CohortMonth
2009-12-01   319.0  427.0  218.0
2010-01-01    88.0   29.0    NaN
2010-02-01    23.0    NaN    NaN
2010-03-01     NaN    NaN    NaN
2010-04-01     NaN    NaN    NaN

[5 rows x 25 columns]
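The nunique-then-pivot step is easier to follow on a toy frame (hypothetical customer IDs) small enough to print whole:

```python
import pandas as pd

# Toy transactions showing the nunique -> pivot step on a tiny scale.
toy = pd.DataFrame({
    'CohortMonth': ['2009-12', '2009-12', '2009-12', '2010-01'],
    'CohortIndex': [1, 1, 2, 1],
    'Customer ID': [13085, 13078, 13085, 14688],
})

# Count distinct customers per (cohort, index) cell, then spread the
# index values across columns.
counts = (toy.groupby(['CohortMonth', 'CohortIndex'])['Customer ID']
             .nunique().reset_index())
table = counts.pivot(index='CohortMonth', columns='CohortIndex',
                     values='Customer ID')
print(table)  # 2009-12 has 2 customers in month 1, 1 retained in month 2
```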
To look at retention rates in the cohorts, we first need each cohort's size, which we get by taking the first column of the retention pivot table.
We then use that to divide across rows (axis = 0, as opposed to 1 for columns) and apply some rounding to make the values more readable. Stay tuned for the output!
cohort_sizes = retention_counts.iloc[:,0]
retention = retention_counts.divide(cohort_sizes, axis = 0)
retention = retention.round(3)
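The axis=0 division is worth seeing on a toy frame (hypothetical counts): the divisor aligns with the row index, so every column in a cohort's row is divided by that cohort's size.

```python
import pandas as pd

# Toy retention counts: rows are cohorts, columns are cohort indices.
counts = pd.DataFrame({1: [100.0, 50.0], 2: [40.0, 10.0]},
                      index=['2009-12', '2010-01'])
sizes = counts.iloc[:, 0]  # first column = cohort size

# axis=0 aligns the divisor with the row index, giving per-cohort rates.
retention = counts.divide(sizes, axis=0).round(3)
print(retention)  # column 1 is all 1.0; column 2 is 0.4 and 0.2
```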
We're employing a similar technique to the above in order to see a cohort pivot based on 'Quantity'.
grouping = df.groupby(['CohortMonth', 'CohortIndex'])
cohort_data = grouping['Quantity'].mean()
cohort_data = cohort_data.reset_index()
average_quantity = cohort_data.pivot(index = 'CohortMonth', columns = 'CohortIndex', values = 'Quantity')
average_quantity = average_quantity.round(1)
print(average_quantity)
CohortIndex        1.0        2.0        3.0        4.0        5.0        6.0  \
CohortMonth
2009-12-01   12.912972  22.392774  21.002176  21.545624  13.489181  15.524853
2010-01-01    9.127481  12.959646   9.310018  12.699391  11.440364  11.780553
2010-02-01    9.824130   9.544484  15.465889  12.873996   9.961326  11.856862
2010-03-01   10.050794  14.106752  12.363773  16.483107  11.027831  16.003225
2010-04-01   10.483552   9.090211   9.251445  14.830010  12.915596  10.788171
...

[25 rows x 25 columns]
(output truncated for readability)
This is the moment of truth. We have flexibility in what we visualize, but retention rate is what matters most. This Seaborn heatmap shows two years' worth of cohort data, and we can instantly see which cohorts performed better.
plt.figure(figsize=(16,10))
plt.title('Retention Rates')
sns.heatmap(data = retention, annot = True, fmt = '.0%', vmin = 0.0, vmax = 0.6, cmap = 'BuGn')
plt.ylabel('Cohort Month')
plt.xlabel('Cohort Index')
plt.show()