This project uses the popular online retail dataset from a UK retailer to derive RFM (recency, frequency, monetary value) metrics that can be used to segment customers.
Segmenting customers helps us understand our customer base better and take appropriate action where a customer is weak or strong in our ecosystem.
The results of this type of analysis can be used by anyone in the organization, but they are aimed squarely at account teams and marketing departments (anyone who works hands-on with customers in some way) so that they can tailor their efforts and have sharper business intelligence for effective strategizing and decision making.
The following serves as a guide and an example of the type of analysis that I can help you with.
To begin, we will import the packages we need for analysis, cleaning, visualization, etc.
import pandas as pd
import seaborn as sns
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import os
This bit of code shows us what the current working directory is and lets us set the active directory to whatever we want it to be. That is handy when importing this code into other tools, which may not have intuitive default directories.
print(os.getcwd())
path = "F:\[Personal]\Data Analytics Portfolio\Cohort Analysis\online_retail"
os.chdir(path)
print(os.getcwd())
C:\Users\Tyler
F:\[Personal]\Data Analytics Portfolio\Cohort Analysis\online_retail
The dataset contains two CSVs (or one Excel file with two sheets). I chose to append them because it's simple, but that method is deprecated, so pd.concat is the recommended approach going forward.
We're also inspecting the data and, most importantly, the datatypes. For this analysis, it's critical that the 'InvoiceDate' column be correctly parsed as datetime.
Below are two methods we use to make sure the datetime imports correctly into any tool. The first is the set of optional arguments passed to the pd.read_csv calls; more on the second below.
df1 = pd.read_csv(r'F:\[Personal]\Data Analytics Portfolio\Cohort Analysis\online_retail\online2009_2010.csv',
encoding = 'unicode_escape', parse_dates=['InvoiceDate'], infer_datetime_format=True)
df2 = pd.read_csv(r'F:\[Personal]\Data Analytics Portfolio\Cohort Analysis\online_retail\online2010_2011.csv',
encoding = 'unicode_escape', parse_dates=['InvoiceDate'], infer_datetime_format=True)
df = df1.append(df2)
print(df.head())
print(df.shape)
print(df.dtypes)
df.isna().sum()
C:\Users\Tyler\AppData\Local\Temp\ipykernel_21604\260097437.py:5: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. df = df1.append(df2)
Invoice StockCode Description Quantity \
0 489434 85048 15CM CHRISTMAS GLASS BALL 20 LIGHTS 12
1 489434 79323P PINK CHERRY LIGHTS 12
2 489434 79323W WHITE CHERRY LIGHTS 12
3 489434 22041 RECORD FRAME 7" SINGLE SIZE 48
4 489434 21232 STRAWBERRY CERAMIC TRINKET BOX 24
InvoiceDate Price Customer ID Country
0 2009-12-01 07:45:00 6.95 13085.0 United Kingdom
1 2009-12-01 07:45:00 6.75 13085.0 United Kingdom
2 2009-12-01 07:45:00 6.75 13085.0 United Kingdom
3 2009-12-01 07:45:00 2.10 13085.0 United Kingdom
4 2009-12-01 07:45:00 1.25 13085.0 United Kingdom
(1067371, 8)
Invoice object
StockCode object
Description object
Quantity int64
InvoiceDate datetime64[ns]
Price float64
Customer ID float64
Country object
dtype: object
Invoice             0
StockCode           0
Description      4382
Quantity            0
InvoiceDate         0
Price               0
Customer ID    243007
Country             0
dtype: int64
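As the FutureWarning above notes, the append method is on its way out. A minimal sketch of the pd.concat equivalent, using the same df1 and df2 loaded above:
# Drop-in replacement for df1.append(df2), without the deprecation warning
df = pd.concat([df1, df2])
# Optionally pass ignore_index=True to renumber rows instead of keeping each file's original index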
This section of code does two things: it ensures the conversion to datetime and does some light cleaning of the data.
We instruct pd.to_datetime to coerce errors (values that can't be parsed become NaT instead of raising an exception), and we set utc to True because datetimes with different offsets (time zones) can create problems during conversion.
This data cleaning isn't strictly necessary for establishing cohorts, but I prefer to do it for cleanliness and to ensure nothing I run later is affected by the negative values observed in both the "Quantity" and "Price" fields.
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], errors='coerce', utc = True)
df['Price'] = df['Price'].abs()
print(df['Price'].sort_values(ascending = True))
df['Quantity'] = df['Quantity'].abs()
print(df['Quantity'].sort_values(ascending = True))
170484 0.00
374514 0.00
374513 0.00
424304 0.00
374512 0.00
...
241827 25111.09
403472 38925.87
222681 38970.00
276274 44031.79
179403 53594.36
Name: Price, Length: 1067371, dtype: float64
541909 1
251307 1
124382 1
124383 1
124385 1
...
90857 19152
61624 74215
61619 74215
540422 80995
540421 80995
Name: Quantity, Length: 1067371, dtype: int64
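As a side note, instead of flipping signs with .abs(), another reasonable cleanup is to drop the problem rows entirely. A rough sketch, assuming (as is commonly the case with this dataset) that cancelled orders carry an Invoice number starting with 'C':
# Alternative cleanup (not used in this analysis): drop cancellations and non-positive values
# Assumes cancelled orders are flagged with an Invoice starting with 'C'
mask = (df['Quantity'] > 0) & (df['Price'] > 0) & ~df['Invoice'].astype(str).str.startswith('C')
df_clean = df[mask].copy()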
We'll begin by calculating spending quartiles, which we will use throughout the analysis. First, we define spend as quantity purchased multiplied by price.
df['Spend'] = df['Quantity'] * df['Price']
print(df.head())
Invoice StockCode Description Quantity \
0 489434 85048 15CM CHRISTMAS GLASS BALL 20 LIGHTS 12
1 489434 79323P PINK CHERRY LIGHTS 12
2 489434 79323W WHITE CHERRY LIGHTS 12
3 489434 22041 RECORD FRAME 7" SINGLE SIZE 48
4 489434 21232 STRAWBERRY CERAMIC TRINKET BOX 24
InvoiceDate Price Customer ID Country Spend
0 2009-12-01 07:45:00+00:00 6.95 13085.0 United Kingdom 83.4
1 2009-12-01 07:45:00+00:00 6.75 13085.0 United Kingdom 81.0
2 2009-12-01 07:45:00+00:00 6.75 13085.0 United Kingdom 81.0
3 2009-12-01 07:45:00+00:00 2.10 13085.0 United Kingdom 100.8
4 2009-12-01 07:45:00+00:00 1.25 13085.0 United Kingdom 30.0
spend_quartiles = pd.qcut(df['Spend'], q = 4, labels = range(1,5))
df['Spend_Quartile'] = spend_quartiles
df.sort_values('Spend')
| | Invoice | StockCode | Description | Quantity | InvoiceDate | Price | Customer ID | Country | Spend | Spend_Quartile |
|---|---|---|---|---|---|---|---|---|---|---|
| 282875 | 561663 | 22501 | incorrectly put back into stock | 108 | 2011-07-28 16:40:00+00:00 | 0.00 | NaN | United Kingdom | 0.00 | 1 |
| 408200 | 528464 | 21146 | NaN | 30 | 2010-10-22 11:36:00+00:00 | 0.00 | NaN | United Kingdom | 0.00 | 1 |
| 524622 | 580609 | 22927 | Amazon | 1 | 2011-12-05 11:41:00+00:00 | 0.00 | NaN | United Kingdom | 0.00 | 1 |
| 83570 | 497186 | 84868A | NaN | 13 | 2010-02-05 17:45:00+00:00 | 0.00 | NaN | United Kingdom | 0.00 | 1 |
| 408201 | 528465 | 21010 | NaN | 59 | 2010-10-22 11:36:00+00:00 | 0.00 | NaN | United Kingdom | 0.00 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 179403 | A506401 | B | Adjust bad debt | 1 | 2010-04-29 13:36:00+00:00 | 53594.36 | NaN | United Kingdom | 53594.36 | 4 |
| 61619 | 541431 | 23166 | MEDIUM CERAMIC TOP STORAGE JAR | 74215 | 2011-01-18 10:01:00+00:00 | 1.04 | 12346.0 | United Kingdom | 77183.60 | 4 |
| 61624 | C541433 | 23166 | MEDIUM CERAMIC TOP STORAGE JAR | 74215 | 2011-01-18 10:17:00+00:00 | 1.04 | 12346.0 | United Kingdom | 77183.60 | 4 |
| 540421 | 581483 | 23843 | PAPER CRAFT , LITTLE BIRDIE | 80995 | 2011-12-09 09:15:00+00:00 | 2.08 | 16446.0 | United Kingdom | 168469.60 | 4 |
| 540422 | C581484 | 23843 | PAPER CRAFT , LITTLE BIRDIE | 80995 | 2011-12-09 09:27:00+00:00 | 2.08 | 16446.0 | United Kingdom | 168469.60 | 4 |
1067371 rows × 10 columns
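Since pd.qcut splits on quantiles, each spend quartile should hold roughly a quarter of the rows. A quick, purely illustrative sanity check:
# Each spend quartile should contain roughly 25% of rows
print(df['Spend_Quartile'].value_counts(normalize=True).sort_index())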
Now we will assign labels to the data, which is the last foundational step before moving on to the RFM calculations. Before we do that, let's add Recency Days to our table. To make things less confusing, "now" will be the first day of 2012 (I'd like to avoid outlandish dates in the analysis).
# Make present_day UTC-aware to match InvoiceDate (subtracting a naive timestamp from a tz-aware column would fail)
present_day = pd.to_datetime('2012-01-01', errors='coerce', utc = True)
df['Recency_Days'] = (present_day - df['InvoiceDate']).dt.days
print(df.head())
Invoice StockCode Description Quantity \
0 489434 85048 15CM CHRISTMAS GLASS BALL 20 LIGHTS 12
1 489434 79323P PINK CHERRY LIGHTS 12
2 489434 79323W WHITE CHERRY LIGHTS 12
3 489434 22041 RECORD FRAME 7" SINGLE SIZE 48
4 489434 21232 STRAWBERRY CERAMIC TRINKET BOX 24
InvoiceDate Price Customer ID Country Spend \
0 2009-12-01 07:45:00+00:00 6.95 13085.0 United Kingdom 83.4
1 2009-12-01 07:45:00+00:00 6.75 13085.0 United Kingdom 81.0
2 2009-12-01 07:45:00+00:00 6.75 13085.0 United Kingdom 81.0
3 2009-12-01 07:45:00+00:00 2.10 13085.0 United Kingdom 100.8
4 2009-12-01 07:45:00+00:00 1.25 13085.0 United Kingdom 30.0
Spend_Quartile Recency_Days Recency_Quartile
0 4 760 1
1 4 760 1
2 4 760 1
3 4 760 1
4 4 760 1
# Creating numbered labels
r_labels = list(range(4,0,-1))
# Divide into groups based on quartiles
recency_quartiles = pd.qcut(df['Recency_Days'], q = 4, labels = r_labels)
# Creating a new column for recency quartiles
df['Recency_Quartile'] = recency_quartiles
df.sort_values('Recency_Days')
| | Invoice | StockCode | Description | Quantity | InvoiceDate | Price | Customer ID | Country | Spend | Spend_Quartile | Recency_Days | Recency_Quartile |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 541909 | 581587 | POST | POSTAGE | 1 | 2011-12-09 12:50:00+00:00 | 18.00 | 12680.0 | France | 18.00 | 4 | 22 | 4 |
| 540814 | 581492 | 22357 | KINGS CHOICE BISCUIT TIN | 2 | 2011-12-09 10:03:00+00:00 | 4.13 | NaN | United Kingdom | 8.26 | 2 | 22 | 4 |
| 540815 | 581492 | 22358 | KINGS CHOICE TEA CADDY | 1 | 2011-12-09 10:03:00+00:00 | 5.79 | NaN | United Kingdom | 5.79 | 2 | 22 | 4 |
| 540816 | 581492 | 22361 | GLASS JAR DAISY FRESH COTTON WOOL | 2 | 2011-12-09 10:03:00+00:00 | 5.79 | NaN | United Kingdom | 11.58 | 3 | 22 | 4 |
| 540817 | 581492 | 22362 | GLASS JAR PEACOCK BATH SALTS | 2 | 2011-12-09 10:03:00+00:00 | 5.79 | NaN | United Kingdom | 11.58 | 3 | 22 | 4 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2144 | 489597 | 22132 | RED LOVE HEART SHAPE CUP | 6 | 2009-12-01 14:28:00+00:00 | 1.70 | NaN | United Kingdom | 10.20 | 3 | 760 | 1 |
| 2143 | 489597 | 22130 | PARTY CONE CHRISTMAS DECORATION | 1 | 2009-12-01 14:28:00+00:00 | 8.65 | NaN | United Kingdom | 8.65 | 2 | 760 | 1 |
| 2142 | 489597 | 22122 | SET OF 2 FANCY FONT TEA TOWELS | 1 | 2009-12-01 14:28:00+00:00 | 6.04 | NaN | United Kingdom | 6.04 | 2 | 760 | 1 |
| 2152 | 489597 | 22143 | CHRISTMAS CRAFT HEART DECORATIONS | 2 | 2009-12-01 14:28:00+00:00 | 4.30 | NaN | United Kingdom | 8.60 | 2 | 760 | 1 |
| 0 | 489434 | 85048 | 15CM CHRISTMAS GLASS BALL 20 LIGHTS | 12 | 2009-12-01 07:45:00+00:00 | 6.95 | 13085.0 | United Kingdom | 83.40 | 4 | 760 | 1 |
1067371 rows × 12 columns
# Creating string labels
r_labels = ['Active', 'Lapsed', 'Inactive', 'Churned']
# Divide into quartiles using these labels
recency_quartiles = pd.qcut(df['Recency_Days'], q = 4, labels = r_labels)
df['Recency_Quartile'] = recency_quartiles
# Sort values from lowest to highest
df.sort_values('Recency_Days')
| | Invoice | StockCode | Description | Quantity | InvoiceDate | Price | Customer ID | Country | Spend | Spend_Quartile | Recency_Days | Recency_Quartile |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 541909 | 581587 | POST | POSTAGE | 1 | 2011-12-09 12:50:00+00:00 | 18.00 | 12680.0 | France | 18.00 | 4 | 22 | Active |
| 540814 | 581492 | 22357 | KINGS CHOICE BISCUIT TIN | 2 | 2011-12-09 10:03:00+00:00 | 4.13 | NaN | United Kingdom | 8.26 | 2 | 22 | Active |
| 540815 | 581492 | 22358 | KINGS CHOICE TEA CADDY | 1 | 2011-12-09 10:03:00+00:00 | 5.79 | NaN | United Kingdom | 5.79 | 2 | 22 | Active |
| 540816 | 581492 | 22361 | GLASS JAR DAISY FRESH COTTON WOOL | 2 | 2011-12-09 10:03:00+00:00 | 5.79 | NaN | United Kingdom | 11.58 | 3 | 22 | Active |
| 540817 | 581492 | 22362 | GLASS JAR PEACOCK BATH SALTS | 2 | 2011-12-09 10:03:00+00:00 | 5.79 | NaN | United Kingdom | 11.58 | 3 | 22 | Active |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2144 | 489597 | 22132 | RED LOVE HEART SHAPE CUP | 6 | 2009-12-01 14:28:00+00:00 | 1.70 | NaN | United Kingdom | 10.20 | 3 | 760 | Churned |
| 2143 | 489597 | 22130 | PARTY CONE CHRISTMAS DECORATION | 1 | 2009-12-01 14:28:00+00:00 | 8.65 | NaN | United Kingdom | 8.65 | 2 | 760 | Churned |
| 2142 | 489597 | 22122 | SET OF 2 FANCY FONT TEA TOWELS | 1 | 2009-12-01 14:28:00+00:00 | 6.04 | NaN | United Kingdom | 6.04 | 2 | 760 | Churned |
| 2152 | 489597 | 22143 | CHRISTMAS CRAFT HEART DECORATIONS | 2 | 2009-12-01 14:28:00+00:00 | 4.30 | NaN | United Kingdom | 8.60 | 2 | 760 | Churned |
| 0 | 489434 | 85048 | 15CM CHRISTMAS GLASS BALL 20 LIGHTS | 12 | 2009-12-01 07:45:00+00:00 | 6.95 | 13085.0 | United Kingdom | 83.40 | 4 | 760 | Churned |
1067371 rows × 12 columns
# Aggregate data at the customer level (instead of the invoice line level we have now)
cust_lev = df.groupby(['Customer ID']).agg({'InvoiceDate': lambda x: (present_day - x.max()).days,
'Invoice' : 'count',
'Spend' : 'sum'})
# Rename the columns to interpret more easily
cust_lev.rename(columns = {'InvoiceDate': 'Recency',
'Invoice' : 'Frequency',
'Spend' : 'MonetaryValue'}, inplace = True)
print(cust_lev.head())
             Recency  Frequency  MonetaryValue
Customer ID
12346.0          347         48      155177.60
12347.0           24        253        5633.32
12348.0           97         51        2019.40
12349.0           40        180        4452.84
12350.0          332         17         334.40
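One caveat worth flagging: counting 'Invoice' here counts invoice line items, not distinct orders, so Frequency is really a line-item count. If you'd rather define frequency as the number of unique invoices, a small variation (illustrative only, not what the rest of this analysis uses) would be:
# Frequency as distinct invoices per customer rather than line items
cust_lev_alt = df.groupby('Customer ID').agg(
    Recency=('InvoiceDate', lambda x: (present_day - x.max()).days),
    Frequency=('Invoice', 'nunique'),
    MonetaryValue=('Spend', 'sum'))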
# Recency Quartile
r_labels = range(4,0,-1)
r_quartiles = pd.qcut(cust_lev['Recency'], 4, labels = r_labels)
cust_lev = cust_lev.assign(R = r_quartiles.values)
# Frequency and Monetary Quartiles
f_labels = range(1,5)
m_labels = range(1,5)
f_quartiles = pd.qcut(cust_lev['Frequency'], 4, labels = f_labels)
m_quartiles = pd.qcut(cust_lev['MonetaryValue'], 4, labels = m_labels)
cust_lev = cust_lev.assign(F = f_quartiles.values)
cust_lev = cust_lev.assign(M = m_quartiles.values)
# Build RFM_Segment by concatenating the R, F, and M quartile values, and RFM_Score by summing them
def join_rfm(x): return str(x['R']) + str(x['F']) + str(x['M'])
cust_lev['RFM_Segment'] = cust_lev.apply(join_rfm, axis=1)
cust_lev['RFM_Score'] = cust_lev[['R', 'F', 'M']].sum(axis=1)
# Largest RFM Segments
cust_lev.groupby('RFM_Segment').size().sort_values(ascending=False)[:10]
RFM_Segment
4.04.04.0    628
1.01.01.0    576
3.04.04.0    339
2.01.01.0    293
2.03.03.0    247
2.02.02.0    246
3.03.03.0    242
1.02.02.0    231
4.03.03.0    211
3.02.02.0    186
dtype: int64
# Summary Metrics per RFM Score
cust_lev.groupby('RFM_Score').agg({
'Recency' : 'mean',
'Frequency' : 'mean',
'MonetaryValue' : ['mean', 'count'] }).round(1)
| RFM_Score | Recency (mean) | Frequency (mean) | MonetaryValue (mean) | MonetaryValue (count) |
|---|---|---|---|---|
| 3 | 575.7 | 9.3 | 167.2 | 576 |
| 4 | 403.9 | 17.4 | 260.7 | 568 |
| 5 | 333.2 | 25.1 | 436.4 | 611 |
| 6 | 252.2 | 33.7 | 880.9 | 628 |
| 7 | 219.5 | 53.8 | 1284.3 | 633 |
| 8 | 167.8 | 75.8 | 1591.4 | 636 |
| 9 | 117.4 | 113.9 | 2711.7 | 546 |
| 10 | 88.6 | 181.0 | 3358.0 | 577 |
| 11 | 56.6 | 286.8 | 5295.1 | 539 |
| 12 | 30.9 | 587.8 | 15409.4 | 628 |
# Grouping Into Named Segments
def segment_name(df):
if df['RFM_Score'] >= 9:
return 'Gold'
elif (df['RFM_Score'] >= 5) and (df['RFM_Score'] < 9):
return 'Silver'
else:
return 'Bronze'
cust_lev['General_Segment'] = cust_lev.apply(segment_name, axis = 1)
cust_lev.groupby('General_Segment').agg({
'Recency' : 'mean',
'Frequency' : 'mean',
'MonetaryValue' : ['mean', 'count'] }).round(1)
| General_Segment | Recency (mean) | Frequency (mean) | MonetaryValue (mean) | MonetaryValue (count) |
|---|---|---|---|---|
| Bronze | 490.4 | 13.3 | 213.6 | 1144 |
| Gold | 72.1 | 301.5 | 6964.8 | 2290 |
| Silver | 242.3 | 47.4 | 1054.6 | 2508 |
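Because seaborn and matplotlib were imported at the top, it's straightforward to visualize how large each named segment is; a minimal sketch:
# Bar chart of customer counts per named segment, best to worst
sns.countplot(data=cust_lev, x='General_Segment', order=['Gold', 'Silver', 'Bronze'])
plt.title('Customers per RFM segment')
plt.show()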
print(cust_lev.head())
Recency Frequency MonetaryValue R F M RFM_Segment \
Customer ID
12346.0 347 48 155177.60 2 2 4 2.02.04.0
12347.0 24 253 5633.32 4 4 4 4.04.04.0
12348.0 97 51 2019.40 3 2 3 3.02.03.0
12349.0 40 180 4452.84 4 4 4 4.04.04.0
12350.0 332 17 334.40 2 1 1 2.01.01.0
RFM_Score General_Segment
Customer ID
12346.0 8 Silver
12347.0 12 Gold
12348.0 8 Silver
12349.0 12 Gold
12350.0 4 Bronze
# Clean up the RFM Segment column to make more readable
cust_lev['RFM_Segment'] = cust_lev.R.astype(str) + cust_lev.F.astype(str) + cust_lev.M.astype(str)
print(cust_lev.head())
Recency Frequency MonetaryValue R F M RFM_Segment \
Customer ID
12346.0 347 48 155177.60 2 2 4 224
12347.0 24 253 5633.32 4 4 4 444
12348.0 97 51 2019.40 3 2 3 323
12349.0 40 180 4452.84 4 4 4 444
12350.0 332 17 334.40 2 1 1 211
RFM_Score General_Segment
Customer ID
12346.0 8 Silver
12347.0 12 Gold
12348.0 8 Silver
12349.0 12 Gold
12350.0 4 Bronze
# Filter for top customers
cust_lev[cust_lev['RFM_Segment'] == '444'].sort_values('MonetaryValue', ascending = False).head()
| Customer ID | Recency | Frequency | MonetaryValue | R | F | M | RFM_Segment | RFM_Score | General_Segment |
|---|---|---|---|---|---|---|---|---|---|
| 18102.0 | 22 | 1068 | 619428.08 | 4 | 4 | 4 | 444 | 12 | Gold |
| 14646.0 | 23 | 3890 | 533862.97 | 4 | 4 | 4 | 444 | 12 | Gold |
| 14156.0 | 31 | 4130 | 331328.05 | 4 | 4 | 4 | 444 | 12 | Gold |
| 14911.0 | 23 | 11613 | 321696.73 | 4 | 4 | 4 | 444 | 12 | Gold |
| 17450.0 | 30 | 448 | 260366.79 | 4 | 4 | 4 | 444 | 12 | Gold |
We now have a means of sorting and segmenting our customers by recency of purchase, frequency of purchase, and monetary value of purchases, as well as a way to aggregate and score them on the individual metrics or combinations thereof.
Additionally, we can filter and sort for any type of customer we want. Since Customer ID is the key of this table, the results are easily traceable back to customer records, so account teams and marketing teams can take appropriate action with each kind of customer: churn mitigation for low-ranking customers, and recognition and perks for high-ranking ones.
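For example, a churn-mitigation list for account teams and a full export for marketing might look something like this (the file name is just a placeholder):
# Highest-value customers sitting in the weakest segment: candidates for churn-mitigation outreach
at_risk = cust_lev[cust_lev['General_Segment'] == 'Bronze'].sort_values('MonetaryValue', ascending=False)
print(at_risk.head())
# Export the full customer-level table for downstream use (hypothetical file name)
cust_lev.to_csv('rfm_customer_segments.csv')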