阅读 158

Python机器学习应用之工业蒸汽数据分析篇详解

本篇文章介绍了如何用Python进行工业蒸汽数据分析的过程及思路,通读本篇对大家的学习或工作具有一定的价值,需要的朋友可以参考下

目录
  • 一、数据集

  • 二、数据分析

    • 1 数据导入

    • 2 数据特征探索(数据可视化)

  • 三、特征优化

    • 四、对特征构造后的训练集和测试集进行主成分分析

      • 五、使用LightGBM模型进行训练和预测

        一、数据集

        1. 训练集 提取码:1234

        2. 测试集 提取码:1234

        二、数据分析

        1 数据导入

        1
        2
        3
        4
        5
        6
        7
        8
        9
        10
        11
        12
        13
        14
        15
        16
        #%% Import the base packages
        import numpy as np
        import pandas as pd
        import matplotlib.pyplot as plt
        import seaborn as sns
        from scipy import stats
        import warnings
        warnings.filterwarnings("ignore")
        #%% Read the data.
        # Raw strings so the backslashes in the Windows paths are not treated as
        # escape sequences (the original mixed "\" and "\/" separators).
        train_data_file = r"D:\Python\ML\data\zhengqi_train.txt"
        test_data_file = r"D:\Python\ML\data\zhengqi_test.txt"
        train_data = pd.read_csv(train_data_file, sep='\t', encoding='utf-8')
        test_data = pd.read_csv(test_data_file, sep='\t', encoding='utf-8')
        #%% Summary statistics of the train/test feature variables
        train_infor = train_data.describe()
        test_infor = test_data.describe()

        2 数据特征探索(数据可视化)

        1
        2
        3
        4
        5
        6
        7
        8
        9
        10
        11
        12
        13
        14
        15
        16
        17
        18
        19
        20
        21
        22
        23
        24
        25
        26
        27
        28
        29
        30
        31
        32
        33
        34
        35
        36
        37
        #%% Visual exploration of the data
        # Box plot of feature V0
        fig = plt.figure(figsize=(4, 6))  # set the figure width and height
        sns.boxplot(y=train_data['V0'],orient="v", width=0.5)
        #%% The same box plot can be drawn for every feature (disabled: 38 subplots)
        '''
        column = train_data.columns.tolist()[:39]  # 列表头
        fig = plt.figure(figsize=(20, 40))  # 指定绘图对象宽度和高度
        for i in range(38):
            plt.subplot(13, 3, i + 1)  # 13行3列子图
            sns.boxplot(train_data[column[i]], orient="v", width=0.5)  # 箱式图
            plt.ylabel(column[i], fontsize=8)
        plt.show()
        '''
        #%% Histogram of V0 plus a Q-Q plot to check whether V0 is roughly normal
        plt.figure(figsize=(10,5))
        ax=plt.subplot(1,2,1)
        sns.distplot(train_data['V0'],fit=stats.norm)
        ax=plt.subplot(1,2,2)
        res = stats.probplot(train_data['V0'], plot=plt)
        #%% Distribution of every feature (disabled: two subplots per column)
        '''
        train_cols = 6
        train_rows = len(train_data.columns)
        plt.figure(figsize=(4*train_cols,4*train_rows))
         
        i=0
        for col in train_data.columns:
            i+=1
            ax=plt.subplot(train_rows,train_cols,i)
            sns.distplot(train_data[col],fit=stats.norm)
             
            i+=1
            ax=plt.subplot(train_rows,train_cols,i)
            res = stats.probplot(train_data[col], plot=plt)
        plt.show()
        '''

        1
        2
        3
        4
        5
        6
        7
        8
        9
        10
        11
        12
        13
        14
        15
        16
        17
        18
        19
        20
        21
        22
        23
        24
        25
        #%% Compare the train/test distribution of the same feature to check they match
        ax = sns.kdeplot(train_data['V0'], color="Red", shade=True)
        ax = sns.kdeplot(test_data['V0'], color="Blue", shade=True)
        ax.set_xlabel('V0')
        ax.set_ylabel("Frequency")
        ax = ax.legend(["train","test"])
         
        #%% The same train/test comparison for every feature (disabled: one subplot per column)
        '''
        dist_cols = 6
        dist_rows = len(test_data.columns)
        plt.figure(figsize=(4*dist_cols,4*dist_rows))
         
        i=1
        for col in test_data.columns:
            ax=plt.subplot(dist_rows,dist_cols,i)
            ax = sns.kdeplot(train_data[col], color="Red", shade=True)
            ax = sns.kdeplot(test_data[col], color="Blue", shade=True)
            ax.set_xlabel(col)
            ax.set_ylabel("Frequency")
            ax = ax.legend(["train","test"])
             
            i+=1
        plt.show()
        '''

        1
        2
        3
        4
        5
        6
        7
        8
        9
        10
        11
        12
        13
        14
        15
        16
        17
        18
        19
        20
        #%% Inspect the train/test distributions of V5, V9, V11, V17, V22, V28
        # (the original comment omitted V17, but the code below includes it)
        drop_col = 6
        drop_row = 1
         
        plt.figure(figsize=(5*drop_col,5*drop_row))
        i=1
        for col in ["V5","V9","V11","V17","V22","V28"]:
            ax =plt.subplot(drop_row,drop_col,i)
            ax = sns.kdeplot(train_data[col], color="Red", shade=True)
            ax = sns.kdeplot(test_data[col], color="Blue", shade=True)
            ax.set_xlabel(col)
            ax.set_ylabel("Frequency")
            ax = ax.legend(["train","test"])
             
            i+=1
        plt.show()
        #%% Drop these features: their train/test distributions differ too much,
        # which would hurt the model's ability to generalize
        drop_columns=["V5","V9","V11","V17","V22","V28"]
        train_data=train_data.drop(columns=drop_columns)
        test_data=test_data.drop(columns=drop_columns)

        当训练数据和测试数据分布不一致的时候,会导致模型的泛化能力差,采用删除此类特征的方法

        1
        2
        3
        4
        5
        6
        7
        8
        9
        10
        11
        12
        13
        14
        15
        16
        17
        18
        19
        20
        21
        22
        23
        24
        25
        26
        27
        28
        29
        30
        31
        32
        33
        34
        35
        36
        37
        #%% Visualise the linear-regression relationship between V0 and target
        fcols = 2
        frows = 1
        plt.figure(figsize=(8,4))
        ax=plt.subplot(1,2,1)
        sns.regplot(x='V0', y='target', data=train_data, ax=ax,
                    scatter_kws={'marker':'.','s':3,'alpha':0.3},
                    line_kws={'color':'k'});
        plt.xlabel('V0')
        plt.ylabel('target')
         
        ax=plt.subplot(1,2,2)
        sns.distplot(train_data['V0'].dropna())
        plt.xlabel('V0')
         
        plt.show()
        #%% The same regression plot for every feature vs target (disabled)
        '''
        fcols = 6
        frows = len(test_data.columns)
        plt.figure(figsize=(5*fcols,4*frows))
         
        i=0
        for col in test_data.columns:
            i+=1
            ax=plt.subplot(frows,fcols,i)
            sns.regplot(x=col, y='target', data=train_data, ax=ax,
                        scatter_kws={'marker':'.','s':3,'alpha':0.3},
                        line_kws={'color':'k'});
            plt.xlabel(col)
            plt.ylabel('target')
             
            i+=1
            ax=plt.subplot(frows,fcols,i)
            sns.distplot(train_data[col].dropna())
            plt.xlabel(col)
        '''


        1
        2
        3
        4
        5
        #%% Inspect the correlations between the feature variables
        train_corr = train_data.corr()
        # Draw the correlation heatmap; annot=True prints the coefficients.
        # plt.subplots returns a (Figure, Axes) tuple — unpack it instead of
        # binding the tuple to `ax` as the original did.
        fig, ax = plt.subplots(figsize=(20, 16))  # set the canvas size
        ax = sns.heatmap(train_corr, vmax=.8, square=True, annot=True)

        1
        2
        3
        4
        5
        6
        7
        8
        9
        #%% Rank the strength of the pairwise correlations
        # (the original lines were missing their closing parentheses — syntax errors)
        plt.figure(figsize=(20, 16))  # set the figure width and height
        colnm = train_data.columns.tolist()  # column headers
        mcorr = train_data[colnm].corr(method="spearman")  # correlation matrix of every variable pair
        mask = np.zeros_like(mcorr, dtype=bool)  # boolean matrix shaped like mcorr (np.bool was removed from NumPy)
        mask[np.triu_indices_from(mask)] = True  # True above the diagonal, so only the lower triangle is drawn
        cmap = sns.diverging_palette(220, 10, as_cmap=True)  # a matplotlib colormap object
        g = sns.heatmap(mcorr, mask=mask, cmap=cmap, square=True, annot=True, fmt='0.2f')  # pairwise-similarity heatmap
        plt.show()

        1
        2
        3
        4
        5
        6
        7
        8
        9
        #%% Features most correlated with the target variable
        # Select the K features with the largest correlation to target
        k = 10  # number of variables for the heatmap
        cols = train_corr.nlargest(k, 'target')['target'].index
         
        # plt.subplots returns (Figure, Axes); the original bound the tuple and
        # also computed an unused np.corrcoef matrix, both removed here.
        fig, _ = plt.subplots(figsize=(10, 10))  # set the canvas size
        hm = sns.heatmap(train_data[cols].corr(), annot=True, square=True)
        plt.show()

        1
        2
        3
        4
        5
        # Keep only the features whose absolute correlation with target exceeds 0.5
        threshold = 0.5
        corrmat = train_data.corr()
        top_corr_features = corrmat.index[abs(corrmat["target"])>threshold]
        plt.figure(figsize=(10,10))
        g = sns.heatmap(train_data[top_corr_features].corr(),annot=True,cmap="RdYlGn")

        1
        2
        3
        4
        5
        6
        7
        8
        9
        10
        11
        12
        13
        14
        15
        16
        17
        18
        19
        20
        21
        22
        23
        24
        25
        26
        27
        #%% Threshold for removing weakly correlated variables
        threshold = 0.05
         
        # Absolute-value correlation matrix
        corr_matrix = train_data.corr().abs()
        drop_col=corr_matrix[corr_matrix["target"]<threshold].index
        #%% Drop the columns whose correlation with target is below 0.05
        train_data=train_data.drop(columns=drop_col)
        test_data=test_data.drop(columns=drop_col)
         
        #%% Concatenate train (without target) and test so they are scaled jointly
        train_x=train_data.drop(['target'],axis=1)
        data_all=pd.concat([train_x,test_data])
         
        #%% Min-max standardisation
        cols_numeric=list(data_all.columns)
        def scale_minmax(col):
            return (col-col.min())/(col.max()-col.min())
         
        # Apply the column-wise min-max transform to the merged frame,
        # then to train and test separately
        data_all[cols_numeric] = data_all[cols_numeric].apply(scale_minmax,axis=0)
        print(data_all[cols_numeric].describe())
        train_data_process = train_data[cols_numeric]
        train_data_process = train_data_process[cols_numeric].apply(scale_minmax,axis=0)
         
        test_data_process = test_data[cols_numeric]
        test_data_process = test_data_process[cols_numeric].apply(scale_minmax,axis=0)

        1
        2
        3
        4
        5
        6
        7
        8
        9
        10
        11
        12
        13
        14
        15
        16
        17
        18
        19
        20
        21
        22
        23
        24
        25
        26
        27
        28
        29
        30
        31
        32
        33
        34
        35
        36
        37
        38
        39
        40
        41
        42
        43
        44
        45
        46
        47
        48
        49
        50
        51
        #%% Box plots of the first four features: check whether their distributions
        # are close to normal before and after a Box-Cox transform
        cols_numeric_0to4 = cols_numeric[0:4]
        ## Check effect of Box-Cox transforms on distributions of continuous variables
         
        train_data_process = pd.concat([train_data_process, train_data['target']], axis=1)
         
        fcols = 6
        frows = len(cols_numeric_0to4)
        plt.figure(figsize=(4*fcols,4*frows))
        i=0
         
        for var in cols_numeric_0to4:
            dat = train_data_process[[var, 'target']].dropna()
                 
            # Column 1: original distribution with a fitted normal curve
            i+=1
            plt.subplot(frows,fcols,i)
            sns.distplot(dat[var] , fit=stats.norm);
            plt.title(var+' Original')
            plt.xlabel('')
                 
            # Column 2: Q-Q plot and skewness of the original data
            i+=1
            plt.subplot(frows,fcols,i)
            _=stats.probplot(dat[var], plot=plt)
            plt.title('skew='+'{:.4f}'.format(stats.skew(dat[var])))
            plt.xlabel('')
            plt.ylabel('')
                 
            # Column 3: scatter of the feature vs target with their correlation
            i+=1
            plt.subplot(frows,fcols,i)
            plt.plot(dat[var], dat['target'],'.',alpha=0.5)
            plt.title('corr='+'{:.2f}'.format(np.corrcoef(dat[var], dat['target'])[0][1]))
          
            # Column 4: Box-Cox transformed (shifted by +1 so all inputs are positive)
            i+=1
            plt.subplot(frows,fcols,i)
            trans_var, lambda_var = stats.boxcox(dat[var].dropna()+1)
            trans_var = scale_minmax(trans_var)
            sns.distplot(trans_var , fit=stats.norm);
            plt.title(var+' Transformed')  # fixed title typo: was 'Tramsformed'
            plt.xlabel('')
                 
            # Column 5: Q-Q plot and skewness after the transform
            i+=1
            plt.subplot(frows,fcols,i)
            _=stats.probplot(trans_var, plot=plt)
            plt.title('skew='+'{:.4f}'.format(stats.skew(trans_var)))
            plt.xlabel('')
            plt.ylabel('')
                 
            # Column 6: transformed feature vs target with their correlation
            i+=1
            plt.subplot(frows,fcols,i)
            plt.plot(trans_var, dat['target'],'.',alpha=0.5)
            plt.title('corr='+'{:.2f}'.format(np.corrcoef(trans_var,dat['target'])[0][1]))

        三、特征优化

        1
        2
        3
        4
        5
        6
        7
        8
        9
        10
        11
        12
        13
        14
        15
        16
        17
        18
        19
        20
        21
        22
        23
        24
        25
        26
        27
        28
        29
        30
        31
        import pandas as pd
         
        # Raw strings so the backslashes in the Windows paths are not treated
        # as escape sequences
        train_data_file = r"D:\Python\ML\data\zhengqi_train.txt"
        test_data_file = r"D:\Python\ML\data\zhengqi_test.txt"
         
        train_data = pd.read_csv(train_data_file, sep='\t', encoding='utf-8')
        test_data = pd.read_csv(test_data_file, sep='\t', encoding='utf-8')
         
        #%%定义特征构造方法,构造特征
        epsilon=1e-5
         
        #组交叉特征,可以自行定义,如增加: x*x/y, log(x)/y 等等,使用lambda函数更方便快捷
        func_dict = {
                    'add': lambda x,y: x+y,
                    'mins': lambda x,y: x-y,
                    'div': lambda x,y: x/(y+epsilon),
                    'multi': lambda x,y: x*y
                    }
        #%%定义特征构造函数
        def auto_features_make(train_data,test_data,func_dict,col_list):
            train_data, test_data = train_data.copy(), test_data.copy()
            for col_i in col_list:
                for col_j in col_list:
                    for func_name, func in func_dict.items():
                        for data in [train_data,test_data]:
                            func_features = func(data[col_i],data[col_j])
                            col_func_features = '-'.join([col_i,func_name,col_j])
                            data[col_func_features] = func_features
            return train_data,test_data
        #%% Build the cross features for both the training set and the test set
        train_data2, test_data2 = auto_features_make(train_data,test_data,func_dict,col_list=test_data.columns)

        四、对特征构造后的训练集和测试集进行主成分分析

        1
        2
        3
        4
        5
        6
        7
        8
        9
        10
        11
        12
        #%% PCA
        from sklearn.decomposition import PCA   # principal component analysis
         
        # Reduce dimensionality with PCA
        pca = PCA(n_components=500)
        # Fit on exactly the feature columns the test set has, so 'target' is
        # excluded and train/test share the same columns. The original
        # iloc[:, 0:-1] only dropped the last constructed feature while keeping
        # 'target' in the fit, mismatching transform(test_data2).
        train_data2_pca = pca.fit_transform(train_data2[test_data2.columns])
        test_data2_pca = pca.transform(test_data2)
        train_data2_pca = pd.DataFrame(train_data2_pca)
        test_data2_pca = pd.DataFrame(test_data2_pca)
        train_data2_pca['target'] = train_data2['target']
        # NOTE(review): the training code below uses the raw constructed
        # features, not the PCA output — confirm which was intended.
        X_train2 = train_data2[test_data2.columns].values
        y_train = train_data2['target']

        五、使用LightGBM模型进行训练和预测

        1
        2
        3
        4
        5
        6
        7
        8
        9
        10
        11
        12
        13
        14
        15
        16
        17
        18
        19
        20
        21
        22
        23
        24
        25
        26
        27
        28
        29
        30
        31
        32
        33
        34
        35
        36
        37
        38
        39
        40
        41
        42
        43
        44
        45
        46
        47
        48
        49
        50
        51
        52
        53
        54
        55
        56
        #%% Train and evaluate a LightGBM model on the newly constructed features
        from sklearn.model_selection import KFold
        from sklearn.metrics import mean_squared_error
        import lightgbm as lgb
        import numpy as np
         
        # 5-fold cross-validation. KFold's first argument is n_splits; the
        # original passed len(X_train2), which would create one fold per sample
        # instead of the 5 folds the comment promised.
        kf = KFold(n_splits=5, shuffle=True, random_state=2019)
        #%%
        # Record the training and validation MSE of every fold
        MSE_DICT = {
            'train_mse':[],
            'test_mse':[]
        }
         
        # Offline training and prediction
        for i, (train_index, test_index) in enumerate(kf.split(X_train2)):
            # LightGBM tree model
            lgb_reg = lgb.LGBMRegressor(
                learning_rate=0.01,
                max_depth=-1,
                n_estimators=5000,
                boosting_type='gbdt',
                random_state=2019,
                objective='regression',
            )
            
            # Split this fold into its training and validation parts.
            # y_train is a pandas Series, so use positional .iloc with the
            # positional indices KFold yields (plain [] would be label-based
            # and break on a non-default index).
            X_train_KFold, X_test_KFold = X_train2[train_index], X_train2[test_index]
            y_train_KFold, y_test_KFold = y_train.iloc[train_index], y_train.iloc[test_index]
             
            # Fit with early stopping on the validation fold
            lgb_reg.fit(
                    X=X_train_KFold,y=y_train_KFold,
                    eval_set=[(X_train_KFold, y_train_KFold),(X_test_KFold, y_test_KFold)],
                    eval_names=['Train','Test'],
                    early_stopping_rounds=100,
                    eval_metric='MSE',
                    verbose=50
                )
         
         
            # Predict on both parts of the fold at the best early-stopped iteration
            y_train_KFold_predict = lgb_reg.predict(X_train_KFold,num_iteration=lgb_reg.best_iteration_)
            y_test_KFold_predict = lgb_reg.predict(X_test_KFold,num_iteration=lgb_reg.best_iteration_)
             
            print('第{}折 训练和预测 训练MSE 预测MSE'.format(i))
            train_mse = mean_squared_error(y_train_KFold_predict, y_train_KFold)
            print('------\n', '训练MSE\n', train_mse, '\n------')
            test_mse = mean_squared_error(y_test_KFold_predict, y_test_KFold)
            print('------\n', '预测MSE\n', test_mse, '\n------\n')
             
            MSE_DICT['train_mse'].append(train_mse)
            MSE_DICT['test_mse'].append(test_mse)
        print('------\n', '训练MSE\n', MSE_DICT['train_mse'], '\n', np.mean(MSE_DICT['train_mse']), '\n------')
        print('------\n', '预测MSE\n', MSE_DICT['test_mse'], '\n', np.mean(MSE_DICT['test_mse']), '\n------')

        ..... 不想等它跑完了,会一直跑到score不再变化或者round=100的时候为止~

        到此这篇关于Python机器学习应用之工业蒸汽数据分析篇详解的文章就介绍到这了

        原文链接:https://blog.csdn.net/qq_43368987/article/details/122486905?spm=1001.2014.3001.5501


        文章分类
        代码人生
        版权声明:本站是系统测试站点,无实际运营。本文内容由互联网用户自发贡献,该文观点仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容, 请发送邮件至 XXXXXXo@163.com 举报,一经查实,本站将立刻删除。
        相关推荐