# Merge the training and test sets into one frame so that every feature
# below is computed over both at once.
train = pd.read_csv('./线下商店销量预测_数据集/train.csv')
test = pd.read_csv('./线下商店销量预测_数据集/test.csv')

# ``ignore_index=True`` gives the combined frame a fresh 0..n-1 index, so row
# labels are unique and index-aligned column assignments stay unambiguous.
df = pd.concat([train, test], ignore_index=True)

# Order rows by shop, item and week: the shift/rolling features computed later
# are positional within each (shop, item) group, so they rely on this order.
df = df.sort_values(['shop_id', 'item_id', 'week'])
def makelag(data: pd.DataFrame, values: pd.Series, shift: int) -> pd.DataFrame:
    """Add lag and rolling-window features of *values* to *data*.

    Columns added, in this order:

    * ``lag_{k}`` for ``k = shift .. shift + 14``: *values* shifted by ``k``
      rows.
    * ``s_{shift}_roll_{w}_{min,max,median,std,mean}`` for ``w = 2 .. 14``:
      rolling statistics over a window of ``w`` rows, computed on *values*
      shifted by ``shift`` rows, so a row's own value never enters its own
      window when ``shift >= 1``.

    Args:
        data: Frame that receives the new columns. It is modified in place.
        values: Series the features are derived from; it must share the
            index of *data* so the assignments align row by row.
        shift: Number of rows to shift by before any feature is computed.

    Returns:
        The same *data* object, with the feature columns added.
    """
    for lag in range(shift, shift + 15):
        data[f'lag_{lag}'] = values.shift(lag)

    # The shifted series is the same for every window size; compute it once
    # rather than once per statistic.
    shifted = values.shift(shift)
    for rolling in range(2, 15):
        window = shifted.rolling(window=rolling)
        prefix = f's_{shift}_roll_{rolling}'
        data[f'{prefix}_min'] = window.min()
        data[f'{prefix}_max'] = window.max()
        data[f'{prefix}_median'] = window.median()
        data[f'{prefix}_std'] = window.std()
        data[f'{prefix}_mean'] = window.mean()
    return data
# Build lag and rolling-window features of weekly sales for every
# (shop, item) pair.
# The groups are iterated explicitly instead of going through
# ``DataFrameGroupBy.apply``: depending on the pandas version, ``apply`` may
# prepend the group keys to the index or drop the grouping columns from the
# frame handed to the function, either of which breaks the groupbys below.
# Each group is copied so ``makelag`` can add columns without writing into a
# slice of ``df``.
df = pd.concat(
    [
        makelag(pair_rows.copy(), pair_rows['weekly_sales'], 1)
        for _, pair_rows in df.groupby(['shop_id', 'item_id'])
    ]
)


def _fill_with_mode(prices):
    """Fill missing prices with the first mode; leave all-missing input as is."""
    modes = prices.mode()
    if modes.empty:
        # No observed price at all for this item: there is nothing to fill with.
        return prices
    return prices.fillna(modes.iloc[0])


# Filled price feature: within each (shop, item) pair forward-fill, then
# back-fill; whatever is still missing gets the item's most frequent price.
# ``transform`` keeps the original row index, so the assignment aligns
# row by row.
df['item_price_fill'] = (
    df.groupby(['shop_id', 'item_id'])['item_price']
    .transform(lambda prices: prices.ffill().bfill())
)
df['item_price_fill'] = df.groupby(['item_id'])['item_price_fill'].transform(_fill_with_mode)

# Mean/std encodings of sales and price per shop, category, item,
# shop x category and shop x item.
sales_groupings = {
    'shop': ['shop_id'],
    'category': ['item_category_id'],
    'item': ['item_id'],
    'shop_cat': ['shop_id', 'item_category_id'],
    'shop_item': ['shop_id', 'item_id'],
}
# NOTE(review): the shop, category and item encodings read the raw
# ``item_price`` while the two combined groupings read ``item_price_fill``.
# This mix is kept exactly as it was; confirm it is intentional.
price_groupings = {
    'shop': (['shop_id'], 'item_price'),
    'category': (['item_category_id'], 'item_price'),
    'shop_cat': (['shop_id', 'item_category_id'], 'item_price_fill'),
    'item': (['item_id'], 'item_price'),
    'shop_item': (['shop_id', 'item_id'], 'item_price_fill'),
}
for func in ['mean', 'std']:
    for prefix, keys in sales_groupings.items():
        df[f'{prefix}_sale_{func}'] = df.groupby(keys)['weekly_sales'].transform(func)
    for prefix, (keys, price_column) in price_groupings.items():
        df[f'{prefix}_price_{func}'] = df.groupby(keys)[price_column].transform(func)

# Price-difference features: group mean price minus the row's filled price,
# for the shop, item, category, shop x category and shop x item means.
price_diff_sources = {
    'shop_price_diff': 'shop_price_mean',
    'item_price_diff': 'item_price_mean',
    'cat_price_diff': 'category_price_mean',
    'shop_cat_price_diff': 'shop_cat_price_mean',
    'shop_item_price_diff': 'shop_item_price_mean',
}
for diff_column, mean_column in price_diff_sources.items():
    df[diff_column] = df[mean_column] - df['item_price_fill']

# Filled price minus the previous row's filled price within each
# (shop, item) pair; this is the week-over-week change provided each pair has
# one row per consecutive week (TODO confirm against the data).
df['week_price_diff'] = df.groupby(['shop_id', 'item_id'])['item_price_fill'].diff()
# Filled price minus the mean of the previous four rows' filled prices within
# each (shop, item) pair (roughly the previous month, assuming one row per
# week - TODO confirm). ``transform`` keeps the original row index, so the
# assignment aligns row by row regardless of how the pandas version in use
# indexes ``apply`` results.
df['month_price_diff'] = (
    df.groupby(['shop_id', 'item_id'])['item_price_fill']
    .transform(lambda prices: prices - prices.shift(1).rolling(4).mean())
)

# Aggregated sales lags: for each lag 1..15, the mean of that lag column
# across all rows sharing the same item, category or shop in the same week.
for lag in range(1, 16):
    lag_column = f'lag_{lag}'
    df[f'item_lag_{lag}'] = df.groupby(['item_id', 'week'])[lag_column].transform('mean')
    df[f'cat_lag_{lag}'] = df.groupby(['item_category_id', 'week'])[lag_column].transform('mean')
    df[f'shop_lag_{lag}'] = df.groupby(['shop_id', 'week'])[lag_column].transform('mean')