|
@@ -1,34 +1,74 @@
|
|
|
-# 创建Elasticsearch客户端
|
|
|
+# -*- coding: utf-8 -*-
|
|
|
+# Import the required libraries
|
|
|
+import pandas as pd
|
|
|
+from catboost import CatBoostClassifier
|
|
|
+from matplotlib import pyplot as plt
|
|
|
+from sklearn.model_selection import train_test_split
|
|
|
|
|
|
-import requests
|
|
|
-import json
|
|
|
+# Load the sample dataset from a local Excel workbook (hard-coded absolute path)
|
|
|
+data = pd.read_excel('/Users/alvin/Downloads/ai_v3_bill_sample02.xlsx')
|
|
|
+# Drop the risk_buss_no column, then coerce every remaining column to numeric (errors='coerce' turns unparseable values into NaN)
|
|
|
+data = data.drop('risk_buss_no', axis=1)
|
|
|
+data = data.apply(pd.to_numeric, errors='coerce')
|
|
|
+print(data.columns)
|
|
|
+X = pd.get_dummies(data.drop('y', axis=1))  # features: every column except 'y'. NOTE(review): all columns were coerced to numeric above, so get_dummies probably has nothing left to encode — confirm
|
|
|
+y = data['y']  # target column
|
|
|
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  # hold out 20% of the rows for testing; fixed random_state
|
|
|
|
|
|
-# Elasticsearch地址
|
|
|
-base_url = 'http://elasticsearch-master.rxdpdev.svc.k5.bigtree.zone:9200'
|
|
|
+# Define the CatBoost classifier and train the model; the hyperparameter values used are listed first:
|
|
|
+# n_estimators 78
|
|
|
+# max_depth 2
|
|
|
+# learning_rate 0.05
|
|
|
+# l2_leaf_reg 17
|
|
|
+# subsample 0.7
|
|
|
+# max_leaves 28
|
|
|
+# min_data_in_leaf 2
|
|
|
+# grow_policy Lossguide
|
|
|
+# model = CatBoostClassifier()
|
|
|
+model = CatBoostClassifier(n_estimators=78, max_depth=2, learning_rate=0.05, l2_leaf_reg=17, subsample=0.7, max_leaves=28, min_data_in_leaf=2, grow_policy='Lossguide')
|
|
|
|
|
|
+model.fit(X_train, y_train, verbose=False)  # fit on the training split only; verbose=False suppresses training log output
|
|
|
|
|
|
-# 插入一条数据
|
|
|
-def insert_data(id, name):
|
|
|
- url = f'{base_url}/rxdp_tag_all/_doc/{id}'
|
|
|
- doc = {
|
|
|
- 'id': id,
|
|
|
- 'name': name
|
|
|
- }
|
|
|
- headers = {'Content-Type': 'application/json'}
|
|
|
- response = requests.put(url, data=json.dumps(doc), headers=headers)
|
|
|
- # print(response.json())
|
|
|
+# Evaluate model performance on the training and test splits
|
|
|
+print('Train accuracy:', model.score(X_train, y_train))
|
|
|
+print('Test accuracy:', model.score(X_test, y_test))
|
|
|
|
|
|
+# Save the model (currently disabled)
|
|
|
+# model.save_model('catboost_model.bin')
|
|
|
|
|
|
-# 根据ID查询数据
|
|
|
-def get_data_by_id(id):
|
|
|
- url = f'{base_url}/rxdp_tag_all/_doc/{id}'
|
|
|
- headers = {'Content-Type': 'application/json'}
|
|
|
- response = requests.get(url, headers=headers)
|
|
|
- print(response.json()['_source'])
|
|
|
+# Load the model (currently disabled)
|
|
|
+# loaded_model = CatBoostClassifier()
|
|
|
+# loaded_model.load_model('catboost_model.bin')
|
|
|
+#
|
|
|
+# Use the trained model to predict class labels and class probabilities for the test split
|
|
|
+preds_class = model.predict(X_test)
|
|
|
+preds_proba = model.predict_proba(X_test)
|
|
|
|
|
|
+# Print the prediction results
|
|
|
+print('Predicted classes:', preds_class)
|
|
|
+print('Predicted probabilities:', preds_proba)
|
|
|
+# Take the first column of preds_proba and print its maximum and minimum (presumably the probability of the first class — confirm against model.classes_)
|
|
|
+print(preds_proba[:, 0].max())
|
|
|
+print(preds_proba[:, 0].min())
|
|
|
|
|
|
-# 调用插入数据函数
|
|
|
-insert_data(2, 'John Doe')
|
|
|
-
|
|
|
-# 调用根据ID查询数据函数
|
|
|
-get_data_by_id(2)
|
|
|
+#
|
|
|
+# from sklearn.metrics import classification_report, confusion_matrix
|
|
|
+# from sklearn.metrics import precision_recall_curve, roc_curve, auc
|
|
|
+#
|
|
|
+# print(classification_report(y_test, model.predict(X_test)))
|
|
|
+# confusion = confusion_matrix(y_test, model.predict(X_test), normalize='all')
|
|
|
+# print(confusion)
|
|
|
+# # AUC ROC Curve plotting
|
|
|
+# probs = model.predict_proba(X_test)
|
|
|
+# preds = probs[:, 1]
|
|
|
+# fpr, tpr, threshold = roc_curve(y_test, preds)
|
|
|
+# roc_auc = auc(fpr, tpr)
|
|
|
+#
|
|
|
+# # plt.figure(figsize = (12, 7))
|
|
|
+# plt.title('Receiver Operating Characteristic', weight='bold')
|
|
|
+# plt.plot(fpr, tpr, 'b', label='CatBoostClassifier (AUC = %0.2f)' % roc_auc)
|
|
|
+# plt.legend(loc='lower right')
|
|
|
+# plt.plot([0, 1], [0, 1], 'r--')
|
|
|
+# plt.ylabel('True Positive Rate', fontsize=12)
|
|
|
+# plt.xlabel('False Positive Rate', fontsize=12)
|
|
|
+# plt.show()
|