Commit a312ba8e authored by rico.liu's avatar rico.liu

update

parent 4b96f5b3
# Auto detect text files and perform LF normalization
* text=auto
# SimilarCharactor
基于音形码,EditDistance的字符串纠正相似度算法
音形码格式:【韵母,声母,结构,四角编码,笔画数】 共8位
音形码相似度算法 参考博客https://blog.csdn.net/chndata/article/details/41114771
TODO 字符串错误匹配算法 参考
结构、四角编码 抓取http://zidian.miaochaxun.com 数据
韵母、声母 使用pinyin包
笔画数抓取https://bihua.51240.com 数据
入口函数在string_similarity.py
繁简切换 Done
ongoing 相似度分值映射调整(sigmoid函数映射)
TODO 字符串包含关系
ongoing 相似度算法添加与调整(bm25)
TODO 字符串错位
# Lookup table from each ASCII digit character to the Chinese numeral
# character that replaces it; the two strings are paired position by position.
char_number_directionary = dict(zip('0123456789', '零一二三四五六七八九'))
\ No newline at end of file
This diff is collapsed.
This diff is collapsed.
def minEditDist(sm, sn):
    """Return the Levenshtein edit distance between two sequences.

    Insertions, deletions and substitutions each cost 1.

    Args:
        sm: First sequence (typically a string).
        sn: Second sequence (typically a string).

    Returns:
        The minimum number of single-element edits that turn ``sm`` into ``sn``.
    """
    # Only the previous row of the DP table is needed to compute the next one,
    # so two rows are kept instead of the full (len(sm)+1) x (len(sn)+1) matrix.
    # Row 0: turning the empty prefix of ``sm`` into each prefix of ``sn``.
    previous = list(range(len(sn) + 1))
    for i, item_m in enumerate(sm, start=1):
        # Column 0: deleting the first ``i`` elements of ``sm``.
        current = [i]
        for j, item_n in enumerate(sn, start=1):
            cost = 0 if item_m == item_n else 1
            current.append(min(
                previous[j] + 1,         # deletion
                current[j - 1] + 1,      # insertion
                previous[j - 1] + cost,  # match / substitution
            ))
        previous = current
    return previous[-1]
# Single-character codes for the leading consonant of a pinyin syllable.
# Despite the name ("final"), every key except '0' is a consonant onset.
# Several consonants deliberately share one code: 'n'/'l' -> '7',
# 'zh'/'z' -> 'E', 'ch'/'c' -> 'F', 'sh'/'s' -> 'G'.
# The '0' key presumably matches the '0' placeholder that
# extract_initial_and_final returns for a syllable with no leading
# consonant -- confirm against the caller.
final_code_dictionary={
'b':'1',
'p':'2',
'm':'3',
'f':'4',
'd':'5',
't':'6',
'n':'7',
'l':'7',
'g':'8',
'k':'9',
'h':'A',
'j':'B',
'q':'C',
'x':'D',
'zh':'E',
'ch':'F',
'sh':'G',
'r':'H',
'z':'E',
'c':'F',
's':'G',
'y':'I',
'w':'J',
'0':'0'
}
\ No newline at end of file
# Single-character codes for the rhyme (vowel part) of a pinyin syllable.
# Despite the name ("initial"), the keys are rhymes; the empty string maps to
# '0' for a syllable with no rhyme part.
# Some rhymes share one code: 'ai'/'ei' -> '7', 'an'/'ang' -> 'F',
# 'en'/'eng' -> 'G', 'in'/'ing' -> 'H'.
# NOTE(review): 'ong' maps to lowercase 'k' while every other letter code is
# uppercase; 'iang' reuses 'O' (also used by 'uai') and 'ue' reuses 'P' (also
# used by 'uan'). These may be unintended collisions -- confirm before relying
# on them.
initial_code_dictionary = {
'a':'1',
'o':'2',
'e':'3',
'i':'4',
'u':'5',
'v':'6',
'ai':'7',
'ei':'7',
'ui':'8',
'ao':'9',
'ou':'A',
'iu':'B',
'ie':'C',
've':'D',
'er':'E',
'an':'F',
'en':'G',
'in':'H',
'un':'I',
'ven':'J',
'ang':'F',
'eng':'G',
'ing':'H',
'ong':'k',
'uo':'L',
'ian':'M',
'iao':'N',
'uai':'O',
'uan':'P',
'uang':'Q',
'ua':'R',
'iong':'S',
'ia':'T',
'iang':'O',
'ue':'P',
'':'0'
}
\ No newline at end of file
This diff is collapsed.
# Symmetric table pairing each pinyin sound with one counterpart sound:
# n <-> l, an <-> ang, en <-> eng, in <-> ing, z <-> zh, c <-> ch, s <-> sh.
# Every pair is stored in both directions so a lookup works from either side.
similar_pronunciation_dictionary = {
'n':'l',
'l':'n',
'an':'ang',
'ang':'an',
'en':'eng',
'eng':'en',
'in':'ing',
'ing':'in',
'z':'zh',
'c':'ch',
's':'sh',
'zh':'z',
'ch':'c',
'sh':'s'
}
\ No newline at end of file
from SimilarCharactor.string_util import string2code,traditional2simplified
from SimilarCharactor.edit_distance import minEditDist
import difflib
import Levenshtein
def similarity_cn(string1, string2):
    """Return a similarity score for two Chinese strings based on their codes.

    Both strings are converted to simplified Chinese, encoded with
    ``string2code`` and compared by edit distance, normalised by the length
    of the longer code string.

    Args:
        string1: First string.
        string2: Second string.

    Returns:
        ``1 - distance / max_len``, so identical code strings give 1.
        Returns 0.0 when neither string produces any code (for example both
        are empty), because there is nothing to compare; previously this case
        raised ``ZeroDivisionError``.
    """
    code_string1 = string2code(traditional2simplified(string1))
    code_string2 = string2code(traditional2simplified(string2))
    longest = max(len(code_string1), len(code_string2))
    if longest == 0:
        # Guard the normalisation below against division by zero.
        return 0.0
    distance = minEditDist(code_string1, code_string2)
    return 1 - distance / longest
def similarity_en(string1, string2):
    """Return the equally weighted mean of two string-similarity ratios.

    The two components are ``Levenshtein.ratio`` and the ``quick_ratio`` of a
    ``difflib.SequenceMatcher`` built without a junk filter.

    Args:
        string1: First string.
        string2: Second string.

    Returns:
        ``0.5 * levenshtein_ratio + 0.5 * quick_ratio``.
    """
    levenshtein_score = Levenshtein.ratio(string1, string2)
    matcher = difflib.SequenceMatcher(None, string1, string2)
    sequence_score = matcher.quick_ratio()
    return levenshtein_score * 0.5 + sequence_score * 0.5
from pypinyin import pinyin,Style,lazy_pinyin
from SimilarCharactor.quadrilateral_code_dictionary import quadrilateral_code_dictionary as qcd
from SimilarCharactor.structure_code_dictionary import structure_code_dictionary as scd
from SimilarCharactor.initial_code_dictionary import initial_code_dictionary as icd
from SimilarCharactor.final_code_dictionary import final_code_dictionary as fcd
from SimilarCharactor.write_number_dictionary import write_number_dictionary as wnd
from SimilarCharactor.character import symbol_lst
from SimilarCharactor.code_directionary import code_directionary as cd
from SimilarCharactor.char_number_directionary import char_number_directionary as cnd
from zhconv import convert
def extract_initial_and_final(pinyin_string):
    """Split a toneless pinyin syllable into its rhyme and leading consonant.

    The names used here are swapped relative to the usual terminology: the
    first returned value (``initial``) is the trailing rhyme, and the second
    (``final``) is the leading consonant. The order and names are kept as-is
    for existing callers.

    Args:
        pinyin_string: A pinyin syllable without tone marks, e.g. ``'zhang'``.

    Returns:
        A tuple ``(initial, final)``: the rhyme part, and the leading
        consonant or ``'0'`` when the syllable does not start with a known
        consonant. An empty string yields ``('', '0')`` instead of raising
        ``IndexError``.
    """
    # Tuples (not strings) so that membership is an exact match and the empty
    # prefix of an empty input is never considered a consonant.
    two_letter_consonants = ('zh', 'ch', 'sh')
    one_letter_consonants = (
        'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k',
        'h', 'j', 'q', 'x', 'r', 'z', 'c', 's', 'y', 'w',
    )

    # Two-letter consonants must be tested first so 'zh' is not split as 'z'.
    head = pinyin_string[:2]
    if head in two_letter_consonants:
        return pinyin_string[2:], head

    head = pinyin_string[:1]
    if head in one_letter_consonants:
        return pinyin_string[1:], head

    # No leading consonant: the whole syllable is the rhyme.
    return pinyin_string, '0'
#编码格式【韵母,声母,结构,四角编码,笔画数】 共8位
def string2code(string):
    """Encode a string as the concatenation of its per-character codes.

    ASCII digits are first replaced through ``cnd``; each character is then
    looked up in ``cd``. Characters without an entry in ``cd`` contribute
    nothing to the result.

    Args:
        string: The text to encode.

    Returns:
        The concatenated code string (empty if no character has a code).
    """
    digits = ('1', '2', '3', '4', '5', '6', '7', '8', '9', '0')
    return ''.join(
        cd.get(cnd[symbol] if symbol in digits else symbol, '')
        for symbol in string
    )
#计算每个汉字的音形码
def get_code():
    """Compute the code of every character and dump it as dict-literal lines.

    For each character returned by ``symbol_lst()`` the code is the
    concatenation of the ``icd``, ``fcd``, ``scd``, ``qcd`` and ``wnd``
    lookups, written to ``./SimilarCharactor/音型码.txt`` as one
    ``'<char>':'<code>',`` line per character.

    Raises:
        KeyError: If a character or pinyin part is missing from one of the
            lookup dictionaries.
    """
    char_array = symbol_lst()
    # ``with`` closes the file even if a lookup fails part-way through, and the
    # explicit encoding keeps the Chinese output independent of the OS locale.
    with open('./SimilarCharactor/音型码.txt', 'w+', encoding='utf-8') as out:
        for char in char_array:
            pinyin_char = lazy_pinyin(char)[0]
            initial, final = extract_initial_and_final(pinyin_char)
            code_string = icd[initial] + fcd[final] + scd[char] + qcd[char] + wnd[char]
            out.write("'" + char + "':'" + code_string + "',\n")
def traditional2simplified(string):
    """Convert ``string`` with zhconv's ``convert`` using the 'zh-cn' locale.

    Args:
        string: The text to convert.

    Returns:
        The value returned by ``convert(string, 'zh-cn')``.
    """
    simplified = convert(string, 'zh-cn')
    return simplified
# NOTE(review): this call is unguarded, so it runs whenever this module is
# executed or imported and rewrites the output file each time. Consider moving
# it under an `if __name__ == '__main__':` guard -- confirm no caller relies on
# the import-time regeneration.
get_code()
\ No newline at end of file
This diff is collapsed.
# -*-coding:utf-8-*-
# 此模块用于爬取汉字结构字典
import requests
from bs4 import BeautifulSoup
import re
from tqdm import tqdm
def get_url(output_path='C:/Users/fooww/Desktop/cv/Word_Structure_Dict.txt'):
    """Scrape the character-structure pages and write a structure dictionary.

    One page per structure keyword is fetched from zidian.miaochaxun.com.
    Every linked character found in a ``<p class="zi">`` element is written
    as a ``'<char>':'<tag>',`` line, where the tag is the keyword's position
    in the list: ``0``-``9`` for the first ten, then ``A``, ``B``, ... for the
    rest.

    Args:
        output_path: File that receives the dictionary lines. It is
            overwritten. Defaults to the path the script originally used.
    """
    key_word_lst = ['danyi', 'zuoyou', 'shangxia', 'zuozhongyou', 'shangzhongxia', 'youshangbaowei', 'zuoshangbaowei', 'zuoxiabaowei','shangsanbaowei','xiasanbaowei','zuosanbaowei','quanbaowei','xiangqian','pinzi']
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                            'Chrome/63.0.3239.132 Safari/537.36'}

    # A single handle opened in 'w' mode replaces the truncate-then-reopen-in-
    # append sequence, and ``with`` guarantees it is closed if a request fails.
    with open(output_path, 'w', encoding='utf-8') as out:
        for index, key_word in enumerate(key_word_lst):
            url = 'http://zidian.miaochaxun.com/' + key_word + '.html'
            print(url)
            res1 = requests.get(url, headers=header)
            res1.encoding = 'utf-8'
            soup1 = BeautifulSoup(res1.text, 'html.parser')
            zi_list = soup1.find_all('p', class_='zi')

            # Indexes 10+ become 'A', 'B', ... (chr(65) == 'A').
            tag = index if index < 10 else chr(55 + index)

            # Remove <span> children first so only the character links remain.
            for paragraph in zi_list:
                for span in paragraph.find_all('span'):
                    span.extract()

            for paragraph in zi_list:
                for word in paragraph.find_all('a'):
                    out.write("'{0}':'{1}',\n".format(word.get_text(), tag))


if __name__ == '__main__':
    get_url()
def pinyin_2_hanzi(pinyinList):
    """Print the score and path of each Pinyin2Hanzi DAG conversion result.

    Args:
        pinyinList: List of pinyin strings passed straight to ``dag``.

    Returns:
        None; results are only printed.
    """
    from Pinyin2Hanzi import DefaultDagParams
    from Pinyin2Hanzi import dag

    # The original comment describes path_num as the number of candidates
    # (mentioning 10); the call requests 1.
    candidates = dag(DefaultDagParams(), pinyinList, path_num=1, log=True)
    for candidate in candidates:
        score = candidate.score
        path = candidate.path  # conversion result
        print(score, path)


# NOTE(review): the whole phrase is passed as ONE list element here; confirm
# whether separate syllables (['hao', 'kai', 'xin']) were intended.
pinyin_2_hanzi(['hao kai xin'])
\ No newline at end of file
# coding=utf-8
import os
import pygame
import character
# 此程序用于将汉字转图片输出,以便利用opencv进行相似度识别
# Output directory for the rendered character images.
chinese_dir = 'D:/py/chinese/'
if not os.path.exists(chinese_dir):
    os.mkdir(chinese_dir)

pygame.init()

# Load the font once instead of on every iteration. The raw string avoids the
# invalid escape sequences (\W, \F, \m) of the original literal; the resulting
# path is unchanged. The Microsoft YaHei font file must exist at this path.
font = pygame.font.Font(r"C:\Windows\Fonts\msyh.ttf", 100)

# Render each character as black text on a white background and save it as
# <index>.png inside chinese_dir.
for i, word in enumerate(character.symbol_lst()):
    rtext = font.render(word, True, (0, 0, 0), (255, 255, 255))
    pygame.image.save(rtext, os.path.join(chinese_dir, str(i) + ".png"))
import requests
from bs4 import BeautifulSoup
import re
import character
from tqdm import tqdm
def transutf8(symbol):
    """Return the UTF-8 encoding of ``symbol`` as a lowercase hex string.

    The previous implementation sliced fixed positions out of the ``repr`` of
    the encoded bytes, which only worked for characters of at most three
    UTF-8 bytes: 4-byte characters were truncated and ASCII input produced an
    empty string. ``bytes.hex()`` gives the same result for 2- and 3-byte
    characters and the full encoding otherwise.

    Args:
        symbol: The character (string) to encode.

    Returns:
        Hex digits of the UTF-8 bytes, e.g. ``'e4b880'`` for ``'一'``.
    """
    return symbol.encode('utf-8').hex()
def writenum(symbol):
    """Fetch the stroke count of ``symbol`` from bihua.51240.com.

    The page URL is built from the hex string returned by ``transutf8``. The
    stroke count is read from the table row whose cell text matches
    '笔画数', taking the cell in that row whose text contains a 1-2 digit
    number.

    Args:
        symbol: The Chinese character to look up.

    Returns:
        The text of the stroke-count cell, as a string.

    Raises:
        AttributeError: If the expected table cells are not found in the page.
    """
    url = 'https://bihua.51240.com/' + transutf8(symbol) + '__bihuachaxun/'
    res = requests.get(url)
    soup = BeautifulSoup(res.text, 'lxml')

    label_pattern = re.compile('笔画数')
    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    number_pattern = re.compile(r'\d{1,2}')

    label_cell = soup.find('td', text=label_pattern)
    count_cell = label_cell.parent.find('td', text=number_pattern)
    return count_cell.get_text()
def get_dict():
    """Build a mapping from every character to its scraped stroke count.

    Iterates ``character.symbol_lst()`` under a ``tqdm`` progress bar and
    calls ``writenum`` once per character.

    Returns:
        dict mapping each character to the value returned by ``writenum``.
    """
    return {
        char_one: writenum(char_one)
        for char_one in tqdm(character.symbol_lst())
    }
def main():
    """Scrape all stroke counts and write them to D:/py/write_num.txt.

    Each record is written on its own line as ``<char> <stroke count>``.
    Previously the records were written back to back with no separator.
    """
    # Explicit UTF-8 so that characters outside the OS default code page do
    # not raise UnicodeEncodeError; ``with`` closes the file on exit, so no
    # explicit close() is needed.
    with open("D:/py/write_num.txt", 'w', encoding='utf-8') as f:
        for char_one, stroke_count in get_dict().items():
            f.write(char_one + ' ' + stroke_count + '\n')


if __name__ == '__main__':
    main()
\ No newline at end of file
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
...@@ -162,5 +162,5 @@ def get_point_category_params_data(category): ...@@ -162,5 +162,5 @@ def get_point_category_params_data(category):
conn_zi_new.close() conn_zi_new.close()
category = '复印纸' category = '一体电脑'
get_point_category_params_data(category) get_point_category_params_data(category)
\ No newline at end of file
...@@ -9,7 +9,7 @@ Created on Thu Feb 25 10:53:59 2021 ...@@ -9,7 +9,7 @@ Created on Thu Feb 25 10:53:59 2021
import pymssql import pymssql
import pandas as pd import pandas as pd
from public import Index from public import Index
import uuid
def transform_simplevalue(cursor_zi_new,shujuzidiandf,categoryname,subtitle,stdvalue): def transform_simplevalue(cursor_zi_new,shujuzidiandf,categoryname,subtitle,stdvalue):
...@@ -92,6 +92,9 @@ def upload_params_data(category,path): ...@@ -92,6 +92,9 @@ def upload_params_data(category,path):
subtitle_id_dict = dict(zip(db_params['subtitle'].tolist(),db_params['subtitleid'].tolist())) subtitle_id_dict = dict(zip(db_params['subtitle'].tolist(),db_params['subtitleid'].tolist()))
subtitle_type_dict = dict(zip(db_params['subtitle'].tolist(),db_params['skuorspu'].tolist())) subtitle_type_dict = dict(zip(db_params['subtitle'].tolist(),db_params['skuorspu'].tolist()))
#获取类别ID
categoryid = db_params['categoryid'].tolist()[0]
#获取命名规则 #获取命名规则
cursor_zi_new.execute(f"select * from skuname_named_rule where categoryname = '{category}'") cursor_zi_new.execute(f"select * from skuname_named_rule where categoryname = '{category}'")
named_rules_df = pd.DataFrame(cursor_zi_new.fetchall(), columns=[tuple[0] for tuple in cursor_zi_new.description]) named_rules_df = pd.DataFrame(cursor_zi_new.fetchall(), columns=[tuple[0] for tuple in cursor_zi_new.description])
...@@ -111,7 +114,7 @@ def upload_params_data(category,path): ...@@ -111,7 +114,7 @@ def upload_params_data(category,path):
process_index = 0 process_index = 0
index_ = Index() index_ = Index()
#暂不支持复印纸命名 #SPU、SKU命名
for index,row in df.iterrows(): for index,row in df.iterrows():
try: try:
print(index_(process_index,len(df)-1), end='%') print(index_(process_index,len(df)-1), end='%')
...@@ -121,17 +124,80 @@ def upload_params_data(category,path): ...@@ -121,17 +124,80 @@ def upload_params_data(category,path):
sku = row['产品编码'] sku = row['产品编码']
brandname = row['产品品牌'] brandname = row['产品品牌']
cursor_zi_new.execute(f"select id from p_brand where name = '{brandname}' and id not in (select distinct pid from p_brand)")
brand_code = cursor_zi_new.fetchone()[0]
cursor_zi_new.execute(f"select pid from p_brand where id = {brand_code}")
father_brand_code = cursor_zi_new.fetchone()[0]
params = dict(zip(subtitle_list,[row[subtitle] for subtitle in subtitle_list])) params = dict(zip(subtitle_list,[row[subtitle] for subtitle in subtitle_list]))
cursor_zi_new.execute(f"select id,spuid from p_sku where sku = '{sku}'") cursor_zi_new.execute(f"select id,spuid from p_sku where sku = '{sku}'")
skuid,spuid = cursor_zi_new.fetchone() skuid,spuid = cursor_zi_new.fetchone()
spuname = ''
skuname = '' skuname = ''
#SPU命名
if category == '复印纸':
if '彩色' in row['产品系列']:
spu_brandname = brandname
else:
if father_brand_code == 0:
spu_brandname = brandname
else:
cursor_zi_new.execute(f"select name from p_brand where id = {father_brand_code}")
father_brand = cursor_zi_new.fetchone()[0].strip()
spu_brandname = father_brand
else:
spu_brandname = brandname
name = str(spu_brandname)
for spu_param in db_params[db_params['skuorspu'] == 'spu'].sort_values("Expr1")['subtitle'].tolist():
if spu_param == '产品品牌':
continue
else:
name += " "
name += str(row[spu_param])
name = name + " " +str(category)#spu名称
spu = str(uuid.uuid1()).replace('-','')#spu编码
cursor_zi_new.execute(f"select id from p_spu where categoryid = {categoryid} and brandid = {brand_code} and spuname = '{name}'")
data = cursor_zi_new.fetchall()
spu_df = pd.DataFrame(data, columns=[tuple[0] for tuple in cursor_zi_new.description])
if spu_df.empty:
cursor_zi_new.execute(f"insert into p_spu (spuname,spu,categoryid,brandid) values ('{name}','{spu}',{categoryid},{brand_code})")
cursor_zi_new.execute(f"select id from p_spu where spuname = '{name}'")
spuid = cursor_zi_new.fetchone()[0]
else:
spuid = spu_df['id'].tolist()[0]
cursor_zi_new.execute(f"update p_sku set spuid = '{spuid}' where id = '{skuid}'")
#SKU命名
for element in named_rule.split(" "): for element in named_rule.split(" "):
if element == '品牌名称': if element == '品牌名称':
#复印纸品牌获取(彩色系列取子品牌,白色系列有父品牌取父品牌,否则取子品牌)
if category == '复印纸':
if '彩色' in row['产品系列']:
value = brandname
else:
if father_brand_code == 0:
value = brandname
else:
cursor_zi_new.execute(f"select name from p_brand where id = {father_brand_code}")
father_brand = cursor_zi_new.fetchone()[0].strip()
value = father_brand
else:
value = brandname value = brandname
elif element == '类别名称': elif element == '类别名称':
...@@ -172,6 +238,8 @@ def upload_params_data(category,path): ...@@ -172,6 +238,8 @@ def upload_params_data(category,path):
value = transform_simplevalue(cursor_zi_new,single_data_dict,category,element,value.upper()) value = transform_simplevalue(cursor_zi_new,single_data_dict,category,element,value.upper())
if element == '计价规格(包/箱或单包装)':
value = '计价规格:' + value
skuname += value + " " skuname += value + " "
...@@ -224,8 +292,11 @@ def upload_params_data(category,path): ...@@ -224,8 +292,11 @@ def upload_params_data(category,path):
skuname_2 = dael_name_content(skuname_part2,brandname).replace(brandname,"") skuname_2 = dael_name_content(skuname_part2,brandname).replace(brandname,"")
skuname = skuname_part1 + " " +category + skuname_2 skuname = skuname_part1 + " " +category + skuname_2
try:
cursor_zi_new.execute(f"update p_sku set skuname = '{skuname}' where sku = '{sku}'") cursor_zi_new.execute(f"update p_sku set skuname = '{skuname}' where sku = '{sku}'")
except:
skuname_double = skuname + '(重复)'
cursor_zi_new.execute(f"update p_sku set skuname = '{skuname_double}',state = '6' where sku = '{sku}'")
#print(skuname) #print(skuname)
...@@ -325,7 +396,7 @@ def upload_params_data(category,path): ...@@ -325,7 +396,7 @@ def upload_params_data(category,path):
print(f"{category_name}参数数据更新完成,命名完成") print(f"{category_name}参数数据更新完成,命名完成")
category_name = '' category_name = '复印纸'
path = '' path = '/Users/rico/Work Space/1_Project/Company/中电中采/数据处理项目_重点类信息提取/复印纸修改0329.xlsx'
upload_params_data(category_name,path) upload_params_data(category_name,path)
\ No newline at end of file
...@@ -7,7 +7,8 @@ Created on Wed Feb 24 15:07:58 2021 ...@@ -7,7 +7,8 @@ Created on Wed Feb 24 15:07:58 2021
""" """
import pandas as pd import pandas as pd
import pymssql import pymssql
from public import zgc_api from public import zgc_api,Index
from SimilarCharactor.string_similarity import similarity_cn,similarity_en
def check_data(category_name,path): def check_data(category_name,path):
...@@ -16,14 +17,17 @@ def check_data(category_name,path): ...@@ -16,14 +17,17 @@ def check_data(category_name,path):
df = df.rename(columns = {'Unnamed: 0':'id'}) df = df.rename(columns = {'Unnamed: 0':'id'})
if '无参数,需补充' in df.values: #if '无参数,需补充' in df.values:
print("仍有参数未补充,请补充完整后再继续处理。") # print("仍有参数未补充,请补充完整后再继续处理。")
return False # return False
#创建新产品库链接 #创建新产品库链接
conn_zi_new = pymssql.connect(host='123.56.115.207', user='zgcprice3311',password='zgcprice20200628',database= 'ZI_NEW',autocommit=True) conn_zi_new = pymssql.connect(host='123.56.115.207', user='zgcprice3311',password='zgcprice20200628',database= 'ZI_NEW',autocommit=True)
cursor_zi_new = conn_zi_new.cursor() cursor_zi_new = conn_zi_new.cursor()
#加载数据字典
cursor_zi_new.execute(f"select subtitle,stdvalue,primitive,simplevalue from ShuJuZiDian_Cfg where categoryname = '{category_name}'")
data_dict = pd.DataFrame(cursor_zi_new.fetchall(), columns=[tuple[0] for tuple in cursor_zi_new.description])
#进行非标转标 #进行非标转标
##加载库中参数项数据(参数id,参数从属) ##加载库中参数项数据(参数id,参数从属)
...@@ -74,8 +78,22 @@ def check_data(category_name,path): ...@@ -74,8 +78,22 @@ def check_data(category_name,path):
#非标准值添加数据字典 #非标准值添加数据字典
flag = False flag = False
dict_id_list = list() dict_id_list = list()
process_index = 0
index_ = Index()
for index,row in df.iterrows(): for index,row in df.iterrows():
try:
print(index_(process_index,len(df)-1), end='%')
except:
print(index_(process_index,1), end='%')
process_index+=1
id_ = str(row['id']) id_ = str(row['id'])
eg_brand_name = row['产品品牌']
eg_product_name = row['产品名称']
for db_param in db_params['subtitle'].tolist(): for db_param in db_params['subtitle'].tolist():
if db_param in ['产品型号', 'CPU属性']:#CPU属性为衍生属性,需要特殊处理 if db_param in ['产品型号', 'CPU属性']:#CPU属性为衍生属性,需要特殊处理
continue continue
...@@ -89,8 +107,32 @@ def check_data(category_name,path): ...@@ -89,8 +107,32 @@ def check_data(category_name,path):
columns=[tuple[0] for tuple in cursor_zi_new.description]) columns=[tuple[0] for tuple in cursor_zi_new.description])
if check_df.empty: if check_df.empty:
#推荐数据字典值
init_fraction = 0
recommend_pri = ''
for dict_pri in data_dict[data_dict['subtitle'] == db_param]['primitive'].unique().tolist():
try:
fraction_part1 = similarity_cn(pri_value,dict_pri)
except:
#print(f"无法比较 '{pri_value}'与 '{dict_pri}'")
#fraction = 0
fraction_part1 = 0
fraction_part2 = similarity_en(pri_value,dict_pri)
fraction = fraction_part1 * 0.5 + fraction_part2 * 0.5
if fraction > init_fraction:
init_fraction = fraction
recommend_pri = dict_pri
recommend_stdvalue = data_dict[data_dict['primitive'] == recommend_pri]['stdvalue'].tolist()[0]
recommend_simplevalue = data_dict[data_dict['primitive'] == recommend_pri]['simplevalue'].tolist()[0]
flag = True flag = True
cursor_zi_new.execute(f"insert into ShuJuZiDian_Cfg (categoryname,subtitle,primitive) values ('{category_name}','{db_param}','{pri_value}')") cursor_zi_new.execute(f"insert into ShuJuZiDian_Cfg (categoryname,subtitle,primitive,eg_brand_name,eg_product_name,recommend_primitive,recommend_stdvalue,recommend_simplevalue) \
values ('{category_name}','{db_param}','{pri_value}','{eg_brand_name}','{eg_product_name}','{recommend_pri}','{recommend_stdvalue}','{recommend_simplevalue}')")
cursor_zi_new.execute(f"select id from ShuJuZiDian_Cfg where categoryname = '{category_name}' and subtitle = '{db_param}' and primitive = '{pri_value}'") cursor_zi_new.execute(f"select id from ShuJuZiDian_Cfg where categoryname = '{category_name}' and subtitle = '{db_param}' and primitive = '{pri_value}'")
dict_id = cursor_zi_new.fetchone()[0] dict_id = cursor_zi_new.fetchone()[0]
...@@ -142,8 +184,8 @@ def check_data(category_name,path): ...@@ -142,8 +184,8 @@ def check_data(category_name,path):
print(f"{category_name}数据通过校验,可更新库内参数") print(f"{category_name}数据通过校验,可更新库内参数")
category_name = '' category_name = '一体电脑'
path = '' path = '/Users/rico/Work Space/1_Project/Company/中电中采/数据处理项目_重点类信息提取/一体电脑/一体电脑参数确认.xlsx'
check_data(category_name,path) check_data(category_name,path)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment