更新時間:2019-08-30 來源:黑馬程序員 瀏覽量:
1. 前置知識
什麼是黑色星期五?
黑色星期五可以簡單理解為國外的雙十一,是指感恩節(十一月第四個星期四)之後的那個星期五,通常也就是十一月的第四個星期五。這一天各大商場都會推出大量的打折和優惠活動。
數據 EDA(探索性數據分析)的研究目的是什麼?
數據分析不是為了分析而分析,而是要通過數據分析來達到某種目的。對黑色星期五銷售數據進行分析,是希望通過數據分析來更好地了解客戶的購買行為,同時也可以為後續的算法建模提供良好的數據支持。
項目簡介
黑色星期五數據(BlackFriday)探索性分析(EDA)。該數據集包括從零售商店獲得的銷售交易數據,是一個幫助我們探索和擴展特征工程技術、並從多個角度逐步了解購物行為的經典數據集。數據集有 537577 行、12 列。
數據集
見數據文件 BlackFriday.csv,數據集有 537577 行、12 列。
環境需求
Anaconda2 + PyCharm + numpy + pandas + matplotlib + seaborn + scikit-learn + RF
運行結果
代碼實現:
# Black Friday EDA: exploratory analysis of roughly 550,000 sales records
# from a retail store.  The data set mixes numeric and categorical columns.

# --- 1. Libraries ---------------------------------------------------------
# Pandas, NumPy, Seaborn and Matplotlib are used for the analysis.
import os
import warnings

# Installed before the third-party imports, exactly as in the original
# script, so that warnings emitted while importing them are silenced too.
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

sns.set(style="darkgrid")

# --- 2. Data loading ------------------------------------------------------
df = pd.read_csv("BlackFriday.csv")
# Recorded df.info() / df.shape output: 537577 rows x 12 columns
#   User_ID, Product_ID, Gender, Age, Occupation, City_Category,
#   Stay_In_Current_City_Years, Marital_Status, Product_Category_1,
#   Product_Category_2, Product_Category_3, Purchase
#   dtypes: float64(2), int64(5), object(5)

# --- 3. Missing values ----------------------------------------------------
# Number of missing entries per column.
total_miss = df.isnull().sum()
# Share of rows missing per column, as a percentage.  The column is
# published as '% missing', so it must hold percent values (the original
# stored the raw fraction, e.g. 0.69 instead of 69.44).
per_miss = total_miss / len(df) * 100
missing_data = pd.DataFrame({'Total missing': total_miss,
                             '% missing': per_miss})
# Recorded result: only two columns have gaps
#   Product_Category_3  373299 missing (69.44 %)
#   Product_Category_2  166986 missing (31.06 %)
# Most products belong to a single category, so it is plausible that few
# have a second category and fewer still a third.

# --- 4. Unique values -----------------------------------------------------
# Recorded values from the original run:
#   distinct products: 3623; highest Product_Category_1 label: 18
#   purchase range: 185 .. 23961
#   distinct shoppers: 5891
#   Stay_In_Current_City_Years: '0', '1', '2', '3', '4+'
#   Age groups: '0-17', '18-25', '26-35', '36-45', '46-50', '51-55', '55+'

# --- 5. Gender ------------------------------------------------------------
# First check whether the records are evenly distributed between genders.
is_male = df['Gender'] == 'M'
is_female = df['Gender'] == 'F'

# All rows belonging to male customers.
count_m = df[is_male]

# Row counts per gender.  Summing the boolean mask replaces
# `.count()[0]`, which indexed a label-indexed Series by position.
# NOTE: these are purchase records, not distinct shoppers (the data set
# has 5891 distinct User_IDs in total).
count_m_count = int(is_male.sum())
count_f = int(is_female.sum())
print("Number of male clients: %s" % count_m_count)
print("Number of female clients: %s" % count_f)
# Recorded result: 405380 male rows vs 132197 female rows.  Male records
# outnumber female ones roughly three to one, so the genders are compared
# through ratios / averages rather than raw counts.

# Average purchase amount per record for each gender.  `.mean()` always
# uses true division; the original `sum() / count` truncated to an integer
# under Python 2 (recorded output: Female 8809.0, Male 9504.0).
print("Female Purchases: %.2f" % df.loc[is_female, 'Purchase'].mean())
print("Male Purchases: %.2f" % df.loc[is_male, 'Purchase'].mean())
# --- Gender: distinct products bought --------------------------------------
# Number of distinct Product_IDs bought by each gender.
items_by_gender = df.groupby('Gender')['Product_ID'].nunique()

# groupby returns its keys in sorted order ('F' before 'M'), so the wedge
# labels are derived from the index.  The original hard-coded
# ['Male', 'Female'], which attached each label to the other gender's wedge.
gender_names = {'F': 'Female', 'M': 'Male'}
plt.pie(items_by_gender,
        labels=[gender_names.get(code, code)
                for code in items_by_gender.index],
        shadow=True, autopct='%1.1f%%',
        colors=['steelblue', 'cornflowerblue'])
plt.title('Unique Item Purchases by Gender')
plt.show()
print(items_by_gender)
# Recorded result: F 3358, M 3582 -- the two are close, with male customers
# buying a slightly wider range of distinct products.

# --- Gender: share of purchases per product category -----------------------
print("====================")
# Category / gender columns for the male rows only.
gender = df.loc[df['Gender'] == 'M', ['Product_Category_1', 'Gender']]

# Purchase records per Product_Category_1 for each gender.  The original
# called .count() without grouping, which produced one total per column
# and made every ratio below equal to 1.0.
gb_gender_m = gender.groupby('Product_Category_1').count()
print("gb_gender_m:\n%s" % gb_gender_m)
gb_gener_f = (df.loc[df['Gender'] == 'F', ['Product_Category_1', 'Gender']]
              .groupby('Product_Category_1').count())

# Join the two count tables side by side and rename the columns; a category
# bought by only one gender counts as 0 for the other.
print("---------------------------------")
cat_bygender = pd.concat([gb_gender_m, gb_gener_f], axis=1).fillna(0)
cat_bygender.columns = ['M ratio', 'F ratio']

# Turn counts into the share of each gender's records in each category.
male_rows = (df['Gender'] == 'M').sum()
female_rows = (df['Gender'] == 'F').sum()
cat_bygender['M ratio'] = cat_bygender['M ratio'] / male_rows
cat_bygender['F ratio'] = cat_bygender['F ratio'] / female_rows

# How much likelier a male record falls into the category than a female one.
cat_bygender['Likelihood (M/F)'] = (cat_bygender['M ratio']
                                    / cat_bygender['F ratio'])
cat_bygender['Total Ratio'] = cat_bygender['M ratio'] + cat_bygender['F ratio']
# sort_values returns a new frame; keep it (the original discarded it).
cat_bygender = cat_bygender.sort_values(by='Likelihood (M/F)',
                                        ascending=False)

# --- Age --------------------------------------------------------------------
# Age is stored as a string label; encode each group as an integer so that
# machine-learning algorithms can consume it.
df['Age_Encoded'] = df['Age'].map({'0-17': 0, '18-25': 1,
                                   '26-35': 2, '36-45': 3,
                                   '46-50': 4, '51-55': 5,
                                   '55+': 6})

# Select the column before aggregating so only Product_ID is processed.
prod_byage = df.groupby('Age')['Product_ID'].nunique()

fig, ax = plt.subplots(1, 2, figsize=(14, 6))
ax = ax.ravel()
# Pass the data by keyword: newer seaborn releases no longer accept it
# positionally.
sns.countplot(x=df['Age'].sort_values(), ax=ax[0], palette="Blues_d")
ax[0].set_xlabel('Age Group')
ax[0].set_title('Age Group Distribution')
sns.barplot(x=prod_byage.index, y=prod_byage.values, ax=ax[1],
            palette="Blues_d")
ax[1].set_xlabel('Age Group')
ax[1].set_title('Unique Products by Age')
plt.show()
# The article's reading of these charts: 26-35 is by far the largest age
# group, yet the number of distinct products bought differs little between
# groups.  Does that also hold for the money spent?

# Total amount spent per age group.
spent_byage = df.groupby('Age')['Purchase'].sum()
plt.figure(figsize=(12, 6))
sns.barplot(x=spent_byage.index, y=spent_byage.values, palette="Blues_d")
# The bars show sums, not means, so the title says "Total".
plt.title('Total Purchases per Age Group')
plt.show()
# The article's reading: the amount spent per age group is proportional to
# the number of records in that group.  A store could use this to stock
# more products for the dominant group, or to market other items to
# broaden its customer base.
# --- Occupation ---------------------------------------------------------------
# Insights on the data in terms of the customers' occupation.
plt.figure(figsize=(12, 6))
# Pass the data by keyword: newer seaborn releases no longer accept it
# positionally.
sns.countplot(x=df['Occupation'])
plt.title('Occupation Distribution')
plt.show()

plt.figure(figsize=(12, 6))
# Select the column before aggregating so only Product_ID is processed.
prod_by_occ = df.groupby('Occupation')['Product_ID'].nunique()
sns.barplot(x=prod_by_occ.index, y=prod_by_occ.values)
plt.title('Unique Products by Occupation')
plt.show()

spent_by_occ = df.groupby('Occupation')['Purchase'].sum()
plt.figure(figsize=(12, 6))
sns.barplot(x=spent_by_occ.index, y=spent_by_occ.values)
plt.title('Total Money Spent per Occupation')
plt.show()
# The article's reading: the total spent per occupation mirrors the number
# of records per occupation, so neither age nor occupation shows odd or
# outlying behaviour.

# --- Products -------------------------------------------------------------------
# The items are unlabelled: a customer could spend the same amount on a few
# expensive items or on many cheap ones.  Since the items themselves are
# unknown, explore their categories instead.
plt.figure(figsize=(12, 6))
prod_by_cat = df.groupby('Product_Category_1')['Product_ID'].nunique()
sns.barplot(x=prod_by_cat.index, y=prod_by_cat.values, palette="Blues_d")
plt.title('Number of Unique Items per Category')
plt.show()
# The article's reading: categories 1, 5 and 8 hold the most distinct
# items.

# Mean purchase per category in ascending category order.  One grouped
# aggregation replaces the original loop, which scanned the whole frame
# once per category.
mean_by_category = df.groupby('Product_Category_1')['Purchase'].mean()
category = mean_by_category.index.tolist()
mean_purchase = mean_by_category.tolist()

plt.figure(figsize=(12, 6))
sns.barplot(x=category, y=mean_purchase)
plt.title('Mean of the Purchases per Category')
plt.xlabel('Product Category')
plt.ylabel('Mean Purchase')
plt.show()
# The article's reading: the categories with the most items are not the
# ones with the highest mean purchase.

# --- Estimate of price and quantity of purchase -----------------------------------
# 'Purchase' is what a customer paid for an unknown quantity of an item.
# Bold assumption: the lowest amount ever paid for a product is its unit
# price.
prod_prices = df.groupby('Product_ID')['Purchase'].min().to_dict()


def find_price(row):
    """Return the estimated unit price for the product in ``row``.

    ``row`` must support ``row['Product_ID']``; the price is looked up in
    the module-level ``prod_prices`` dict.  Raises ``KeyError`` for a
    product that is not in ``prod_prices``.

    Kept for callers that work row by row; the column below is filled
    with a vectorised ``map`` instead of ``df.apply(find_price, axis=1)``.
    """
    return prod_prices[row['Product_ID']]


# Every Product_ID in df is a key of prod_prices (it was built from df), so
# the map yields the same values as the row-wise apply without calling a
# Python function once per row.
df['Price'] = df['Product_ID'].map(prod_prices)
# Estimated number of units bought.  Series.round() is used rather than the
# builtin round(), which is not meant for whole Series.
df['Amount'] = (df['Purchase'] / df['Price']).round().astype(int)