1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374 |
# -*- coding: utf-8 -*-
"""Train and evaluate a CatBoost classifier on a sample Excel dataset.

The script loads the spreadsheet, drops the ``risk_buss_no`` column, coerces
every remaining column to numeric, fits a ``CatBoostClassifier`` on an 80/20
train/test split, and prints the accuracies together with the predicted
classes and probabilities for the held-out rows.
"""
# Import the required libraries
import pandas as pd
from catboost import CatBoostClassifier
# plt is only needed by the optional ROC plot that is commented out at the
# bottom of this script.
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

# Load the sample dataset
data = pd.read_excel('/Users/alvin/Downloads/ai_v3_bill_sample02.xlsx')

# Keep every column except risk_buss_no and convert the rest to numbers.
# errors='coerce' turns any value that cannot be parsed into NaN.
data = data.drop('risk_buss_no', axis=1)
data = data.apply(pd.to_numeric, errors='coerce')
print(data.columns)

# The coercion above is applied to the target column as well, so an
# unparsable label silently becomes NaN. Rows without a label cannot be used
# for supervised training or scoring, so drop them explicitly and report it.
rows_before = len(data)
data = data.dropna(subset=['y'])
if len(data) != rows_before:
    print('Dropped rows with missing y:', rows_before - len(data))

# NOTE(review): every column is numeric at this point, so get_dummies is
# expected to return the frame unchanged (it only encodes object/string/
# category columns by default) — confirm whether categorical features were
# meant to be one-hot encoded before the numeric coercion.
X = pd.get_dummies(data.drop('y', axis=1))
y = data['y']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Define the CatBoost classifier and train the model.
# Baseline with library defaults, kept for comparison:
# model = CatBoostClassifier()
# NOTE(review): a depth-2 tree has at most 4 leaves, so max_leaves=28 is
# presumably never the binding limit — confirm the intended depth/leaves.
model = CatBoostClassifier(
    n_estimators=78,
    max_depth=2,
    learning_rate=0.05,
    l2_leaf_reg=17,
    subsample=0.7,
    max_leaves=28,
    min_data_in_leaf=2,
    grow_policy='Lossguide',
)
model.fit(X_train, y_train, verbose=False)

# Evaluate model performance
print('Train accuracy:', model.score(X_train, y_train))
print('Test accuracy:', model.score(X_test, y_test))

# Save the model
# model.save_model('catboost_model.bin')
# Load the model
# loaded_model = CatBoostClassifier()
# loaded_model.load_model('catboost_model.bin')

# Use the model to predict on the held-out rows
preds_class = model.predict(X_test)
preds_proba = model.predict_proba(X_test)

# Print the predictions
print('Predicted classes:', preds_class)
print('Predicted probabilities:', preds_proba)

# Print the maximum and minimum of the first column of preds_proba
print(preds_proba[:, 0].max())
print(preds_proba[:, 0].min())

# Optional, currently disabled: classification report, confusion matrix and
# ROC curve for the held-out rows.
#
# from sklearn.metrics import classification_report, confusion_matrix
# from sklearn.metrics import precision_recall_curve, roc_curve, auc
#
# print(classification_report(y_test, model.predict(X_test)))
# confusion = confusion_matrix(y_test, model.predict(X_test), normalize='all')
# print(confusion)
# # AUC ROC Curve plotting
# probs = model.predict_proba(X_test)
# preds = probs[:, 1]
# fpr, tpr, threshold = roc_curve(y_test, preds)
# roc_auc = auc(fpr, tpr)
#
# # plt.figure(figsize = (12, 7))
# plt.title('Receiver Operating Characteristic', weight='bold')
# plt.plot(fpr, tpr, 'b', label='CatBoostClassifier (AUC = %0.2f)' % roc_auc)
# plt.legend(loc='lower right')
# plt.plot([0, 1], [0, 1], 'r--')
# plt.ylabel('True Positive Rate', fontsize=12)
# plt.xlabel('False Positive Rate', fontsize=12)
# plt.show()
|