|
@@ -1,114 +1,17 @@
|
|
|
-def exec(obj):
|
|
|
- from sqlalchemy import create_engine
|
|
|
- import pandas as pd
|
|
|
- from urllib.parse import quote_plus as urlquote
|
|
|
- import jieba
|
|
|
- from gensim import corpora, models, similarities
|
|
|
+import jieba
|
|
|
|
|
|
- def cReate_dAta_cOnn():
|
|
|
- engine = create_engine('mysql+pymysql://aimp_user:'+urlquote('vjeygLP76n7%UPx@')+'@rm-qsls3302.mysql.rds.aliyuncs.com:3302/bi_application')
|
|
|
- return engine
|
|
|
+# Load a custom user dictionary (optional)
|
|
|
+# jieba.load_userdict("userdict.txt")
|
|
|
|
|
|
- def rEad_aNd_dAtaframe(sql_query):
|
|
|
- engine = cReate_dAta_cOnn()
|
|
|
- df = pd.read_sql(sql_query,engine)
|
|
|
- engine.dispose()
|
|
|
- return df
|
|
|
+# Text to be segmented
|
|
|
+text = "田云 住在什么地方"
|
|
|
|
|
|
- def flag(x):
|
|
|
- if (x['sensoir'] == x['person']) and len(x['sensoir'])==3:
|
|
|
- return 1
|
|
|
- elif (x['sensoir'] == x['person']) and len(x['sensoir'])==2:
|
|
|
- return 2
|
|
|
- else:
|
|
|
- return 3
|
|
|
+# Segment in precise mode (cut_all=False)
|
|
|
+seg_list = jieba.cut(text, cut_all=False)
|
|
|
|
|
|
- def flag1(x):
|
|
|
- if x['f1'] == 1:
|
|
|
- return 1
|
|
|
- else:
|
|
|
- return 0
|
|
|
+print("精确模式分词结果:", "/ ".join(seg_list))
|
|
|
|
|
|
- def flag2(x):
|
|
|
- if (x['f1'] == 2) and x['sims']>0.8:
|
|
|
- return 1
|
|
|
- else:
|
|
|
- return 0
|
|
|
+# Segment in full mode (cut_all=True)
|
|
|
+seg_list = jieba.cut(text, cut_all=True)
|
|
|
|
|
|
- def flag3(x):
|
|
|
- if x['sims']>0.95:
|
|
|
- return 1
|
|
|
- else:
|
|
|
- return 0
|
|
|
-
|
|
|
- def cal_similar(doc_goal,ssim):
|
|
|
- doc = rEad_aNd_dAtaframe('''select distinct credit_no,econ_reg_address from ext_anti_fraud_address ''')
|
|
|
- doc_list = [jieba.lcut(w) for w in doc['econ_reg_address']]
|
|
|
- target = [word for word in jieba.cut(doc_goal)]
|
|
|
- dictionary = corpora.Dictionary(doc_list)
|
|
|
- corpus = [dictionary.doc2bow(doc) for doc in doc_list]
|
|
|
- doc_goal_vec = dictionary.doc2bow(target)
|
|
|
- tfidf = models.TfidfModel(corpus)
|
|
|
- index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features = len(dictionary.keys()))
|
|
|
- sims = index[tfidf[doc_goal_vec]]
|
|
|
- similary = pd.DataFrame({"risk_address": list(doc['econ_reg_address']), "sims": list(sims)})
|
|
|
- similary["申请企业注册地址"] = doc_goal
|
|
|
- similary_data = similary[["申请企业注册地址", "risk_address", "sims"]].drop_duplicates()
|
|
|
- similary_data= similary_data[similary_data["sims"]>=ssim]
|
|
|
- return similary_data
|
|
|
-
|
|
|
- lcity = list(obj['city'])
|
|
|
- if len(lcity)>1 and len(lcity[0])>1:
|
|
|
- lcity = list(obj['city'])[0]
|
|
|
- elif len(lcity)==1 and len(lcity[0])>1:
|
|
|
- lcity = obj['city']
|
|
|
- else:
|
|
|
- obj['flag1'] = 0
|
|
|
- obj['flag2'] = 0
|
|
|
- obj['flag3'] = 0
|
|
|
- return obj
|
|
|
-credit = []
|
|
|
-address = []
|
|
|
-sensoir = list(obj['oper_names'])
|
|
|
-if len(sensoir)<1:
|
|
|
- obj['flag1'] = 0
|
|
|
- obj['flag2'] = 0
|
|
|
- credit.append(obj['credit_code'])
|
|
|
- address.append(obj['address_'])
|
|
|
- df_app = pd.DataFrame({"credit": credit,"address": address})
|
|
|
- df_add = rEad_aNd_dAtaframe("select distinct credit_no,econ_reg_address from ext_anti_fraud_address where city = {}".format(lcity))
|
|
|
- similary_data=cal_similar(df_app['address'].max(),0.95)
|
|
|
- if similary_data.shape[0]>0:
|
|
|
- obj['flag3'] = 1
|
|
|
- else:
|
|
|
- obj['flag3'] = 0
|
|
|
- obj['similary_data']=similary_data.to_json(orient='records')
|
|
|
- return obj
|
|
|
-else:
|
|
|
- for i in sensoir:
|
|
|
- credit.append(obj['credit_code'])
|
|
|
- address.append(obj['address_'])
|
|
|
- df_app = pd.DataFrame({"credit": credit, "sensoir": sensoir,"address": address})
|
|
|
- df_add = rEad_aNd_dAtaframe("select distinct credit_no,econ_reg_address from ext_anti_fraud_address where city = {}".format(lcity))
|
|
|
- lcredit =str(list(df_add['credit_no'])).replace('[','').replace(']','')
|
|
|
- df_per = rEad_aNd_dAtaframe("select distinct credit_no,person from ext_anti_fraud_senior_person where credit_no in ({})".format(lcredit))
|
|
|
- df_dec = pd.merge(df_app,df_per,left_on = 'sensoir',right_on = 'person',how = 'inner')
|
|
|
- df_f = pd.merge(df_dec,df_add,on = 'credit_no',how = 'left')
|
|
|
- if df_f.shape[0]<1:
|
|
|
- obj['flag1'] = 0
|
|
|
- obj['flag2'] = 0
|
|
|
- obj['flag3'] = 0
|
|
|
- obj['df_f']=df_f.to_json(orient='records')
|
|
|
- return obj
|
|
|
- else:
|
|
|
- df_f['f1']=df_f.apply(flag,axis=1)
|
|
|
- similary_data=cal_similar(df_f['address'].max(),0)
|
|
|
- df = pd.merge(df_f,similary_data,left_on='econ_reg_address',right_on = 'risk_address',how = 'left')
|
|
|
- df['flag1']=df.apply(flag1,axis=1)
|
|
|
- df['flag2']=df.apply(flag2,axis=1)
|
|
|
- df['flag3']=df.apply(flag3,axis=1)#保存
|
|
|
- obj['df']=df.to_json(orient='records')
|
|
|
- obj['flag1']=df['flag1'].max()
|
|
|
- obj['flag2']=df['flag2'].max()
|
|
|
- obj['flag3']=df['flag3'].max()
|
|
|
- return obj
|
|
|
+print("全模式分词结果:", "/ ".join(seg_list))
|