tianyunperfect 2 年之前
父節點
當前提交
190c6a791a
共有 4 個文件被更改,包括 64 次插入和 47 次删除
  1. 29 9
      tmp.py
  2. 1 1
      tmp/kafka/kafkaReceive.py
  3. 9 25
      tmp2.py
  4. 25 12
      tmp4.py

+ 29 - 9
tmp.py

@@ -1,13 +1,33 @@
-import requests
+# 导入必要的库
+import pandas as pd
+from catboost import CatBoostClassifier, Pool
+from sklearn.model_selection import train_test_split
 
-url = "https://api.btdp.k7.bigtree.tech/api/fakeTenant/ai_risk_model_monitor/1.0/app/dh_date?date=2023-02"
+# 加载示例数据集
+data = pd.read_csv('/Users/alvin/Downloads/mushrooms.csv')
+X = pd.get_dummies(data.drop('class', axis=1))
+y = data['class']
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
 
-payload={}
-headers = {
-    'apikey': '45f51e0b38e14178ad31e4eda44e42cd',
-    'apisecret': 'kEqdZha5xY448kh/JpwniIs74/pX5lPN8AT9lxLnjbGtFIdW32zbjT34Kxp52vqgofC43b/RSOEHPNgaKequKA=='
-}
+# 定义CatBoost分类器并训练模型
+model = CatBoostClassifier(iterations=100, depth=2, learning_rate=0.1, loss_function='Logloss')
+model.fit(X_train, y_train, verbose=False)
 
-response = requests.request("GET", url, headers=headers, data=payload)
+# 评估模型性能
+print('Train accuracy:', model.score(X_train, y_train))
+print('Test accuracy:', model.score(X_test, y_test))
 
-print(response.text)
+# 保存模型
+model.save_model('catboost_model.bin')
+
+# 加载模型
+loaded_model = CatBoostClassifier()
+loaded_model.load_model('catboost_model.bin')
+
+# 使用模型进行预测
+preds_class = loaded_model.predict(X_test)
+preds_proba = loaded_model.predict_proba(X_test)
+
+# 输出预测结果
+print('Predicted classes:', preds_class)
+print('Predicted probabilities:', preds_proba)

+ 1 - 1
tmp/kafka/kafkaReceive.py

@@ -11,4 +11,4 @@ consumer = KafkaConsumer(sys.argv[2],
                          bootstrap_servers=sys.argv[1].split(",")
                          )
 for msg in consumer:
-    print(datetime.datetime.now(), msg.value.decode('utf-8').encode('utf-8').decode('unicode_escape'))
+    print(datetime.datetime.now(), msg.value.decode('utf-8'))

+ 9 - 25
tmp2.py

@@ -1,29 +1,13 @@
-import requests
+from sklearn.preprocessing import MinMaxScaler
 
+# 原始数据集
+data = [[1000, 200000], [1500, 300000], [1200, 240000], [1800, 360000]]
 
-def get_list(month: str, target: str):
-    headers = {
-        'apikey': '45f51e0b38e14178ad31e4eda44e42cd',
-        'apisecret': 'kEqdZha5xY448kh/JpwniIs74/pX5lPN8AT9lxLnjbGtFIdW32zbjT34Kxp52vqgofC43b/RSOEHPNgaKequKA=='
-    }
-    date_res = requests.get("https://api.btdp.k7.bigtree.tech/api/fakeTenant/ai_risk_model_monitor/1.0/app/dh_date?", headers=headers, params={"date": month})
-    response = requests.get("https://api.btdp.k7.bigtree.tech/api/fakeTenant/ai_risk_model_monitor/1.0/app/dh_monitor", headers=headers,
-                            params={
-                                'update_dt': month,
-                                'tag': target,
-                                'model_id': 'ai_tax_v2',
-                                'startDate': date_res.json()['startDate'],
-                                'endDate': date_res.json()['endDate']
-                            })
-    return response.json()
+# 定义MinMaxScaler对象
+scaler = MinMaxScaler()
 
+# 对数据进行归一化
+data_normalized = scaler.fit_transform(data)
 
-list1 = get_list("2023-02", "dpd7")
-print(list1)
-
-# res_list = [
-#     {
-#         "inModel": "{\"fraud18\":27.0,\"drs_nodebtscore\":41.0,\"tl_id_m6_nbank_passorg\":1.0,\"cn_zerodeclaration_12m\":0.0,\"cn_zeroincome_24m\":0,\"cv_vat_sales_12m\":2.2078623149,\"growth_index\":null,\"income_duty_amount_24m\":11382.1,\"income_tax_12m\":91843.36,\"income_tax_12m_chu_income_duty_amount_12m\":0.9999999985,\"late_fee_24m\":0.0,\"month_late_fee_24m\":0,\"now_amount_tax_arrears_24m\":0.0,\"now_month_tax_arrears_12m\":0,\"slope_vat_sales_12m\":-0.0745797193,\"vat_duty_amount_12m\":132939.54,\"vat_duty_month_3m\":1,\"vat_sales_huanbi_1y\":0.0395612354,\"vat_sales_huanbi_6m\":-0.3750672378,\"penalty_count_total\":0,\"registcapi_chu_reccap\":0.7310585786,\"stockpercent\":80.0,\"stockpercent_jicha\":60.0}",
-#         "tag": "0",
-#         "score": "784       "
-#     }]
+# 输出归一化后的数据集
+print(data_normalized)

+ 25 - 12
tmp4.py

@@ -1,12 +1,25 @@
-from kafka import KafkaConsumer
-
-# tag_sink_topic\tag_atom_topic
-consumer = KafkaConsumer('tag_atom_topic',
-                         group_id='test115',  # 一个组消费一次
-                         auto_offset_reset='latest',  # 从最新数据读取,earliest,latest
-                         bootstrap_servers=['kafka-0.kafka-headless.rxdpdev.svc.k5.bigtree.zone:9092', 'kafka-1.kafka-headless.rxdpdev.svc.k5.bigtree.zone:9092', 'kafka-2.kafka-headless.rxdpdev.svc.k5.bigtree.zone:9092']
-                         # bootstrap_servers=['kafka-0.kafka-headless.aimpdev.svc.k5.bigtree.zone:9092', 'kafka-1.kafka-headless.aimpdev.svc.k5.bigtree.zone:9092', 'kafka-2.kafka-headless.aimpdev.svc.k5.bigtree.zone:9092']
-                         # bootstrap_servers=['kafka-0.kafka-headless.aimptest.svc.k5.bigtree.zone:9092', 'kafka-1.kafka-headless.aimptest.svc.k5.bigtree.zone:9092', 'kafka-2.kafka-headless.aimptest.svc.k5.bigtree.zone:9092']
-                         )
-for msg in consumer:
-    print(msg.value)
+import pandas as pd
+import numpy as np
+from scipy.stats import ttest_ind
+
+# 读取数据文件并创建 DataFrame 对象
+data = pd.read_csv('/Users/alvin/Downloads/sales.csv')
+
+# 数据清洗和预处理
+data = data.dropna()  # 删除缺失值
+data = data[data['sales'] > 0]  # 删除销售数量为 0 的数据
+
+# 计算销售总量和总价
+data['total_sales'] = data['sales'] * data['price']
+
+# 对数据进行分组统计,按照 'product_type' 列分组,然后计算每个分组中 'total_sales' 列的总和、平均值和标准差
+grouped_data = data.groupby('product_type')['total_sales'].agg([np.sum, np.mean, np.std])
+
+# 对两个产品类型的销售总量进行 T 检验,判断它们的均值是否有显著差异
+product1_sales = data[data['product_type'] == 'product1']['total_sales']
+product2_sales = data[data['product_type'] == 'product2']['total_sales']
+t_statistic, p_value = ttest_ind(product1_sales, product2_sales)
+
+# 输出统计结果
+print('销售数据统计结果:\n', grouped_data)
+print('两种产品类型的销售总量 T 检验结果:\n', 'T 统计量 =', t_statistic, 'p 值 =', p_value)