init

9130348c · rico.liu · 3cbd2818 · 9130348c · 9130348c · 9130348c
Commit 9130348c authored Dec 02, 2021 by rico.liu
39 changed files
--- a/数据治理平台线下处理/ESCore/ES.py
+++ b/数据治理平台线下处理/ESCore/ES.py
--- a/数据治理平台线下处理/ESCore/__pycache__/ES.cpython-36.pyc
+++ b/数据治理平台线下处理/ESCore/__pycache__/ES.cpython-36.pyc
--- a/数据治理平台线下处理/线下处理/SimilarCharactor/.gitattributes
+++ b/数据治理平台线下处理/线下处理/SimilarCharactor/.gitattributes
+# Auto detect text files and perform LF normalization
+* text=auto
--- a/数据治理平台线下处理/线下处理/SimilarCharactor/.gitignore
+++ b/数据治理平台线下处理/线下处理/SimilarCharactor/.gitignore
+.idea
+__pycache__
--- a/数据治理平台线下处理/线下处理/SimilarCharactor/README.md
+++ b/数据治理平台线下处理/线下处理/SimilarCharactor/README.md
+# SimilarCharactor
+基于音形码，EditDistance的字符串纠正相似度算法
+音形码格式：【韵母，声母，结构，四角编码，笔画数】 共8位
+音形码相似度算法 参考博客https://blog.csdn.net/chndata/article/details/41114771  
+TODO 字符串错误匹配算法 参考
+结构、四角编码 抓取http://zidian.miaochaxun.com 数据  
+韵母、声母 使用pinyin包  
+笔画数抓取https://bihua.51240.com 数据
+入口函数在string_similarity.py
+繁简切换 Done 
+ongoing 相似度分值映射调整(sigmoid函数映射)
+TODO 字符串包含关系 
+ongoing 相似度算法添加与调整(bm25)  
+TODO 字符串错位  
--- a/数据治理平台线下处理/线下处理/SimilarCharactor/char_number_directionary.py
+++ b/数据治理平台线下处理/线下处理/SimilarCharactor/char_number_directionary.py
# Maps every ASCII digit character to the Chinese numeral character that
# stands for the same value.
char_number_directionary = dict(zip('0123456789', '零一二三四五六七八九'))
\ No newline at end of file
--- a/数据治理平台线下处理/线下处理/SimilarCharactor/character.py
+++ b/数据治理平台线下处理/线下处理/SimilarCharactor/character.py
--- a/数据治理平台线下处理/线下处理/SimilarCharactor/code_directionary.py
+++ b/数据治理平台线下处理/线下处理/SimilarCharactor/code_directionary.py
--- a/数据治理平台线下处理/线下处理/SimilarCharactor/edit_distance.py
+++ b/数据治理平台线下处理/线下处理/SimilarCharactor/edit_distance.py
def minEditDist(sm, sn):
    """Return the Levenshtein edit distance between two sequences.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn ``sm`` into ``sn``.

    :param sm: first sequence (any indexable sequence, typically ``str``)
    :param sn: second sequence
    :return: the edit distance as a non-negative ``int``
    """
    # Only the previous row of the DP table is needed to compute the next
    # one, so keep two rows instead of the full (len(sm)+1) x (len(sn)+1) grid.
    previous = list(range(len(sn) + 1))
    for i, left in enumerate(sm, start=1):
        current = [i]
        for j, right in enumerate(sn, start=1):
            cost = 0 if left == right else 1
            current.append(min(previous[j] + 1,          # deletion
                               current[j - 1] + 1,       # insertion
                               previous[j - 1] + cost))  # substitution / match
        previous = current
    return previous[-1]
--- a/数据治理平台线下处理/线下处理/SimilarCharactor/final_code_dictionary.py
+++ b/数据治理平台线下处理/线下处理/SimilarCharactor/final_code_dictionary.py
# One-character code for the leading consonant of a pinyin syllable.
# The key '0' is the placeholder used when a syllable has no leading consonant.
# Some consonants deliberately share a code: n/l, zh/z, ch/c and sh/s.
final_code_dictionary = dict([
    ('b', '1'), ('p', '2'), ('m', '3'), ('f', '4'),
    ('d', '5'), ('t', '6'), ('n', '7'), ('l', '7'),
    ('g', '8'), ('k', '9'), ('h', 'A'),
    ('j', 'B'), ('q', 'C'), ('x', 'D'),
    ('zh', 'E'), ('ch', 'F'), ('sh', 'G'), ('r', 'H'),
    ('z', 'E'), ('c', 'F'), ('s', 'G'),
    ('y', 'I'), ('w', 'J'),
    ('0', '0'),
])
\ No newline at end of file
--- a/数据治理平台线下处理/线下处理/SimilarCharactor/initial_code_dictionary.py
+++ b/数据治理平台线下处理/线下处理/SimilarCharactor/initial_code_dictionary.py
# One-character code for the vowel part (everything after the leading
# consonant) of a pinyin syllable.  The empty string maps to '0'.
# The front/back nasal pairs an/ang, en/eng and in/ing share a code, as do
# ai/ei.
# NOTE(review): 'ong' is the only lowercase code ('k'), and 'iang'/'uai' both
# map to 'O' while 'ue'/'uan' both map to 'P'.  These look accidental, but the
# values are kept unchanged because previously generated code tables depend on
# them - confirm before changing.
initial_code_dictionary = dict([
    ('a', '1'), ('o', '2'), ('e', '3'), ('i', '4'), ('u', '5'), ('v', '6'),
    ('ai', '7'), ('ei', '7'), ('ui', '8'), ('ao', '9'), ('ou', 'A'),
    ('iu', 'B'), ('ie', 'C'), ('ve', 'D'), ('er', 'E'),
    ('an', 'F'), ('en', 'G'), ('in', 'H'), ('un', 'I'), ('ven', 'J'),
    ('ang', 'F'), ('eng', 'G'), ('ing', 'H'), ('ong', 'k'),
    ('uo', 'L'), ('ian', 'M'), ('iao', 'N'), ('uai', 'O'), ('uan', 'P'),
    ('uang', 'Q'), ('ua', 'R'), ('iong', 'S'), ('ia', 'T'),
    ('iang', 'O'), ('ue', 'P'),
    ('', '0'),
])
\ No newline at end of file
--- a/数据治理平台线下处理/线下处理/SimilarCharactor/quadrilateral_code_dictionary.py
+++ b/数据治理平台线下处理/线下处理/SimilarCharactor/quadrilateral_code_dictionary.py
--- a/数据治理平台线下处理/线下处理/SimilarCharactor/similar_pronunciation_dictionary.py
+++ b/数据治理平台线下处理/线下处理/SimilarCharactor/similar_pronunciation_dictionary.py
# Pairs of pinyin fragments that are easily confused with each other.
# Every pair is stored in both directions, so a lookup works from either side.
similar_pronunciation_dictionary = dict(zip(
    ('n', 'l', 'an', 'ang', 'en', 'eng', 'in', 'ing', 'z', 'c', 's', 'zh', 'ch', 'sh'),
    ('l', 'n', 'ang', 'an', 'eng', 'en', 'ing', 'in', 'zh', 'ch', 'sh', 'z', 'c', 's'),
))
\ No newline at end of file
--- a/数据治理平台线下处理/线下处理/SimilarCharactor/string_similarity.py
+++ b/数据治理平台线下处理/线下处理/SimilarCharactor/string_similarity.py
+from SimilarCharactor.string_util import string2code,traditional2simplified
+from SimilarCharactor.edit_distance import minEditDist
+import difflib
+import Levenshtein
def similarity_cn(string1, string2):
    """Return a similarity score for two Chinese strings.

    Both strings are converted to Simplified Chinese, encoded with
    ``string2code`` and compared by edit distance.  The distance is
    normalised by the longer code string, so 1 means identical codes.

    :param string1: first string
    :param string2: second string
    :return: ``1 - distance / max(code lengths)``
    """
    code_string1 = string2code(traditional2simplified(string1))
    code_string2 = string2code(traditional2simplified(string2))
    longest = max(len(code_string1), len(code_string2))
    if longest == 0:
        # Neither input contains a character that has a code, so there is
        # nothing to compare and the normalisation would divide by zero.
        # Fall back to plain equality of the raw inputs.
        return 1.0 if string1 == string2 else 0.0
    distance = minEditDist(code_string1, code_string2)
    return 1 - distance / longest
def similarity_en(string1, string2):
    """Return a similarity score for two non-Chinese strings.

    The score is the equally weighted mean of ``Levenshtein.ratio`` and
    ``difflib.SequenceMatcher.quick_ratio`` for the two inputs.
    """
    levenshtein_score = Levenshtein.ratio(string1, string2)
    matcher = difflib.SequenceMatcher(None, string1, string2)
    sequence_score = matcher.quick_ratio()
    return levenshtein_score*0.5+sequence_score*0.5
--- a/数据治理平台线下处理/线下处理/SimilarCharactor/string_util.py
+++ b/数据治理平台线下处理/线下处理/SimilarCharactor/string_util.py
+from pypinyin import pinyin,Style,lazy_pinyin
+from SimilarCharactor.quadrilateral_code_dictionary import quadrilateral_code_dictionary as qcd
+from SimilarCharactor.structure_code_dictionary import structure_code_dictionary as scd
+from SimilarCharactor.initial_code_dictionary import initial_code_dictionary as icd
+from SimilarCharactor.final_code_dictionary import final_code_dictionary as fcd
+from SimilarCharactor.write_number_dictionary import write_number_dictionary as wnd
+from SimilarCharactor.character import symbol_lst
+from SimilarCharactor.code_directionary import code_directionary as cd
+from SimilarCharactor.char_number_directionary import char_number_directionary as cnd
+from zhconv import convert
def extract_initial_and_final(pinyin_string):
    """Split a pinyin syllable into its vowel part and leading consonant.

    NOTE: the names are kept for backward compatibility but are swapped
    relative to the usual linguistic terms: the first element of the result
    (``initial``) is the vowel part and the second (``final``) is the leading
    consonant.

    :param pinyin_string: a toneless pinyin syllable such as ``'zhang'``
    :return: ``(initial, final)``; ``final`` is ``'0'`` when the syllable has
        no leading consonant, and an empty syllable yields ``('', '0')``
    """
    single_letter_consonants = ('b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h',
                                'j', 'q', 'x', 'r', 'z', 'c', 's', 'y', 'w')
    if pinyin_string[0:2] in ('zh', 'ch', 'sh'):
        # Two-letter consonants must be tested before the single letters
        # z / c / s, which would otherwise match their first character.
        return pinyin_string[2:], pinyin_string[0:2]
    # Slicing (rather than indexing) keeps the empty string from raising.
    if pinyin_string[0:1] in single_letter_consonants:
        return pinyin_string[1:], pinyin_string[0]
    # No leading consonant: the whole syllable is the vowel part.
    return pinyin_string, '0'
# Code layout per character: [vowel part, leading consonant, structure,
# four-corner code, stroke count], 8 characters in total.
def string2code(string):
    """Encode a string as the concatenated codes of its characters.

    ASCII digits are first replaced through ``cnd``; every character is then
    looked up in ``cd``.  Characters that have no entry contribute nothing.

    :param string: text to encode
    :return: the concatenated code string
    """
    ascii_digits = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '0']
    pieces = []
    for symbol in string:
        if symbol in ascii_digits:
            symbol = cnd[symbol]
        pieces.append(cd.get(symbol, ''))
    return ''.join(pieces)
# Compute the sound-shape code of every character
def get_code():
    """Write the sound-shape code of every known character to a text file.

    For each character returned by ``symbol_lst()`` the code is the
    concatenation of its vowel code, consonant code, structure code,
    four-corner code and stroke count.  One line of the form
    ``'<char>':'<code>',`` is written per character to
    ``./SimilarCharactor/音型码.txt`` (relative to the working directory).

    :raises KeyError: if a character or pinyin fragment is missing from one
        of the lookup tables
    """
    char_array = symbol_lst()
    # The context manager closes the file even when a lookup fails midway;
    # the explicit encoding keeps the output independent of the platform
    # default, which may not be able to encode every character.
    with open('./SimilarCharactor/音型码.txt', 'w+', encoding='utf-8') as file:
        for char in char_array:
            pinyin_char = lazy_pinyin(char)[0]
            initial, final = extract_initial_and_final(pinyin_char)
            code_string = icd[initial] + fcd[final] + scd[char] + qcd[char] + wnd[char]
            file.write("'" + char + "':'" + code_string + "',\n")
def traditional2simplified(string):
    """Return ``string`` converted by ``zhconv.convert`` with the 'zh-cn' locale."""
    simplified = convert(string, 'zh-cn')
    return simplified
# Regenerate the code table only when this module is executed as a script.
# An unguarded call would rewrite the output file every time the module is
# imported, e.g. by string_similarity.py.
if __name__ == '__main__':
    get_code()
\ No newline at end of file
--- a/数据治理平台线下处理/线下处理/SimilarCharactor/structure_code_dictionary.py
+++ b/数据治理平台线下处理/线下处理/SimilarCharactor/structure_code_dictionary.py
--- a/数据治理平台线下处理/线下处理/SimilarCharactor/symbol_structure.py
+++ b/数据治理平台线下处理/线下处理/SimilarCharactor/symbol_structure.py
+# -*-coding:utf-8-*-
+# 此模块用于爬取汉字结构字典
+import requests
+from bs4 import BeautifulSoup
+import re
+from tqdm import tqdm
def get_url(output_path='C:/Users/fooww/Desktop/cv/Word_Structure_Dict.txt'):
    """Crawl the character-structure pages and write a structure dictionary.

    One page per structure keyword is fetched from zidian.miaochaxun.com.
    Every character found on a page is written as a line
    ``'<char>':'<tag>',`` where the tag identifies the keyword: its index for
    the first ten keywords, then the letters 'A', 'B', ... for the rest.

    :param output_path: file to (over)write with the dictionary entries;
        defaults to the original hard-coded location
    """
    key_word_lst = ['danyi', 'zuoyou', 'shangxia', 'zuozhongyou', 'shangzhongxia', 'youshangbaowei', 'zuoshangbaowei', 'zuoxiabaowei','shangsanbaowei','xiasanbaowei','zuosanbaowei','quanbaowei','xiangqian','pinzi']
    # The value is kept identical to the original literal, including the run
    # of spaces that its backslash line continuation embedded in the string.
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                            '            Chrome/63.0.3239.132 Safari/537.36'}
    # A single handle opened in 'w' mode replaces the truncate-then-reopen
    # dance and is closed (and flushed) even when a request fails.
    with open(output_path, 'w', encoding='utf-8') as file1:
        for index, key_word in enumerate(key_word_lst):
            url = 'http://zidian.miaochaxun.com/'+key_word+'.html'
            print(url)
            res1 = requests.get(url, headers=header, timeout=30)
            res1.encoding = 'utf-8'
            soup1 = BeautifulSoup(res1.text, 'html.parser')
            zi_list = soup1.find_all('p', class_='zi')
            # The tag depends only on the keyword, not on the character.
            tag = index if index < 10 else chr(55+index)
            for s in zi_list:
                # Drop <span> children so only the character links remain.
                for span in s.find_all('span'):
                    span.extract()
                for word in s.find_all('a'):
                    file1.write("'{0}':'{1}',\n".format(word.get_text(), tag))
if __name__ == '__main__':
    get_url()
--- a/数据治理平台线下处理/线下处理/SimilarCharactor/test.py
+++ b/数据治理平台线下处理/线下处理/SimilarCharactor/test.py
def pinyin_2_hanzi(pinyinList):
    """Print the most likely Chinese text for a sequence of pinyin syllables.

    :param pinyinList: iterable of pinyin strings; an item may hold several
        space-separated syllables (e.g. ``'hao kai xin'``), which are split
        so that the converter receives one syllable per element
    """
    from Pinyin2Hanzi import DefaultDagParams
    from Pinyin2Hanzi import dag
    dagParams = DefaultDagParams()
    # NOTE(review): the library examples pass one syllable per element, so a
    # single 'hao kai xin' string would presumably not match anything - confirm.
    syllables = [syllable for item in pinyinList for syllable in item.split()]
    # path_num is the number of candidate results to return
    result = dag(dagParams, syllables, path_num=1, log=True)
    for item in result:
        score = item.score
        res = item.path  # conversion result
        print(score, res)
if __name__ == '__main__':
    pinyin_2_hanzi(['hao kai xin'])
\ No newline at end of file
--- a/数据治理平台线下处理/线下处理/SimilarCharactor/transform_character_2_img.py
+++ b/数据治理平台线下处理/线下处理/SimilarCharactor/transform_character_2_img.py
+# coding=utf-8
+import os
+import pygame
+import character
# This script renders every Chinese character to an image file so that the
# images can be compared with OpenCV for shape similarity.
chinese_dir = 'D:/py/chinese/'
os.makedirs(chinese_dir, exist_ok=True)
pygame.init()
# The font does not depend on the character, so load it once instead of on
# every iteration.  The Microsoft YaHei font file must exist at this path
# (look in C:\Windows\Fonts).  A raw string avoids invalid escape sequences.
font = pygame.font.Font(r"C:\Windows\Fonts\msyh.ttf", 100)
for i, word in enumerate(character.symbol_lst()):
    # Black glyph on a white background, anti-aliased.
    rtext = font.render(word, True, (0, 0, 0), (255, 255, 255))
    pygame.image.save(rtext, os.path.join(chinese_dir, str(i) + ".png"))
--- a/数据治理平台线下处理/线下处理/SimilarCharactor/write_num.py
+++ b/数据治理平台线下处理/线下处理/SimilarCharactor/write_num.py
+import requests
+from bs4 import BeautifulSoup
+import re
+import character
+from tqdm import tqdm
def transutf8(symbol):
    """Return the UTF-8 bytes of ``symbol`` as a lowercase hex string.

    For a 3-byte character such as '一' the result is ``'e4b880'``, the same
    value the previous implementation obtained by slicing ``repr()`` of the
    bytes.  Unlike that approach, this also works for characters whose UTF-8
    encoding is not exactly three bytes long.

    :param symbol: the character (or string) to encode
    :return: hex digits of the UTF-8 encoding, two per byte
    """
    return symbol.encode('utf-8').hex()
def writenum(symbol):
    """Fetch the stroke count of a character from bihua.51240.com.

    The page URL is built from the hex form of the character's UTF-8 bytes.
    The stroke count is read from the table row whose label cell contains
    '笔画数'.

    :param symbol: the Chinese character to look up
    :return: the stroke count as the text found on the page (a ``str``)
    :raises ValueError: if the page does not contain the expected row
    """
    url_head = 'https://bihua.51240.com/'
    url_tail = '__bihuachaxun/'
    url = url_head + transutf8(symbol) + url_tail
    # A timeout keeps one stalled request from hanging the whole crawl.
    res = requests.get(url, timeout=30)
    soup = BeautifulSoup(res.text, 'lxml')
    label_cell = soup.find('td', text=re.compile('笔画数'))
    if label_cell is None:
        raise ValueError("stroke-count row not found for %r at %s" % (symbol, url))
    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    count_cell = label_cell.parent.find('td', text=re.compile(r'\d{1,2}'))
    if count_cell is None:
        raise ValueError("stroke-count value not found for %r at %s" % (symbol, url))
    return count_cell.get_text()
def get_dict():
    """Return a dict mapping each known character to its stroke count.

    Characters come from ``character.symbol_lst()``; every stroke count is
    fetched with ``writenum`` while ``tqdm`` shows progress.
    """
    return {char_one: writenum(char_one) for char_one in tqdm(character.symbol_lst())}
def main():
    """Crawl all stroke counts and write them to ``D:/py/write_num.txt``.

    Each line holds one ``<char> <stroke count>`` record.
    """
    # The ``with`` block closes the file; utf-8 is set explicitly so every
    # character can be written regardless of the platform default encoding.
    with open("D:/py/write_num.txt", 'w', encoding='utf-8') as f:
        for i, j in get_dict().items():
            # Terminate every record; without the newline all entries run
            # together on one line and cannot be parsed back.
            f.write(i + ' ' + j + '\n')
if __name__ == '__main__':
    main()
\ No newline at end of file
--- a/数据治理平台线下处理/线下处理/SimilarCharactor/write_number_dictionary.py
+++ b/数据治理平台线下处理/线下处理/SimilarCharactor/write_number_dictionary.py
--- a/数据治理平台线下处理/线下处理/SimilarCharactor/四角编码字典（70000字）.txt
+++ b/数据治理平台线下处理/线下处理/SimilarCharactor/四角编码字典（70000字）.txt
--- a/数据治理平台线下处理/线下处理/SimilarCharactor/四角编码字典（7000字）.txt
+++ b/数据治理平台线下处理/线下处理/SimilarCharactor/四角编码字典（7000字）.txt
--- a/数据治理平台线下处理/线下处理/SimilarCharactor/形近字语料库.txt
+++ b/数据治理平台线下处理/线下处理/SimilarCharactor/形近字语料库.txt
--- a/数据治理平台线下处理/线下处理/SimilarCharactor/形近字语料库（CV2）.txt
+++ b/数据治理平台线下处理/线下处理/SimilarCharactor/形近字语料库（CV2）.txt
--- a/数据治理平台线下处理/线下处理/SimilarCharactor/笔画数字典（20000字）.txt
+++ b/数据治理平台线下处理/线下处理/SimilarCharactor/笔画数字典（20000字）.txt
--- a/数据治理平台线下处理/线下处理/SimilarCharactor/笔画数字典（7000字）.txt
+++ b/数据治理平台线下处理/线下处理/SimilarCharactor/笔画数字典（7000字）.txt
--- a/数据治理平台线下处理/线下处理/SimilarCharactor/结构字典.txt
+++ b/数据治理平台线下处理/线下处理/SimilarCharactor/结构字典.txt
--- a/数据治理平台线下处理/线下处理/SimilarCharactor/结构字典（20000字）.txt
+++ b/数据治理平台线下处理/线下处理/SimilarCharactor/结构字典（20000字）.txt
--- a/数据治理平台线下处理/线下处理/SimilarCharactor/结构字典（最新）.txt
+++ b/数据治理平台线下处理/线下处理/SimilarCharactor/结构字典（最新）.txt
--- a/数据治理平台线下处理/线下处理/SimilarCharactor/音型码.txt
+++ b/数据治理平台线下处理/线下处理/SimilarCharactor/音型码.txt
--- a/数据治理平台线下处理/线下处理/SimilarCharactor/音近字语料库.txt
+++ b/数据治理平台线下处理/线下处理/SimilarCharactor/音近字语料库.txt
--- a/数据治理平台线下处理/线下处理/public.py
+++ b/数据治理平台线下处理/线下处理/public.py
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Mar 23 23:36:40 2021
+@author: rico
+"""
+import requests
def zgc_api(func,data):
    """POST ``data`` as JSON to the ``func`` endpoint and return the decoded reply.

    :param func: endpoint name, inserted into the service URL
    :param data: mapping with the request payload; it is not modified
    :return: the JSON-decoded response body
    """
    headers = {
    'Connection': 'Keep-Alive'
    }
    # NOTE(review): the API key and host are hard-coded in source; consider
    # loading them from configuration or the environment.
    key = 'eRo1#ZFHY5N&GEzV'
    api = f"http://59.110.219.171:8000/{func}/"
    print(api)
    # Build a new payload so the caller's dict is not changed as a side effect.
    payload = dict(data)
    payload['key'] = key
    # The context manager releases the session's connections when done.
    with requests.session() as session:
        result = session.post(api,json=payload,headers=headers,timeout=6000).json()
    return result
# Progress bar
class Index(object):
    """Callable that renders a one-line textual progress bar.

    Calling an instance with ``(now, total)`` returns a string that starts
    with a carriage return, followed by a bracketed bar of '#' characters and
    the completion percentage.
    """

    def __init__(self, number=50, decimal=2):
        """
        :param number: width of the bar, i.e. how many '#' mean 100 %
        :param decimal: number of decimal places kept in the percentage
        """
        self.number = number
        self.decimal = decimal
        # Percentage points represented by a single '#'.
        self.a = 100/number

    def __call__(self, now, total):
        """Return the rendered bar for ``now`` out of ``total``."""
        percentage = self.percentage_number(now, total)
        filled = int(percentage / self.a)
        bar = self.progress_bar(filled)
        return f"\r{bar} {percentage}"

    def percentage_number(self, now, total):
        """
        Compute the completion percentage.

        :param now: current count
        :param total: total count
        :return: the percentage, rounded to ``self.decimal`` places
        """
        ratio = now / total * 100
        return round(ratio, self.decimal)

    def progress_bar(self, num):
        """
        Render the bracketed bar.

        :param num: number of '#' characters to show
        :return: the bar, padded with spaces up to ``self.number`` characters
        """
        hashes = "#" * num
        padding = " " * (self.number - num)
        return f"[{hashes}{padding}]"
--- a/数据治理平台线下处理/线下处理/参数补充线下处理脚本.py
+++ b/数据治理平台线下处理/线下处理/参数补充线下处理脚本.py
--- a/数据治理平台线下处理/线下处理/基础信息线下处理脚本.py
+++ b/数据治理平台线下处理/线下处理/基础信息线下处理脚本.py
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Nov 15 18:05:40 2021
+@author: rico
+"""
import datetime
import os
import re
import sys

import pandas as pd
import pymssql

# The project root must be on sys.path before the project-local imports below.
sys.path.append(os.path.dirname(os.getcwd()))
from ESCore.ES import ES_Client
from public import zgc_api,Index
# Import data
def import_data(es,type,path):
    """Push the manually reviewed rows of an Excel sheet into Elasticsearch.

    Rows with a rejection reason (column '驳回原因') are sent through
    ``es.UpdateReturnData`` together with a timestamp; all other rows are
    sent through ``es.UpdateDealData`` with the confirmed value for ``type``.

    :param es: client object providing ``UpdateReturnData`` and ``UpdateDealData``
    :param type: one of 'brand', 'category', 'model_nopoint', 'model_nonreductice'
    :param path: path of the reviewed Excel file
    :return: ``False`` when ``type`` is not supported, otherwise ``None``
    """
    confirmed_column_by_type = {
        'brand': '人工确认品牌',
        'category': '人工确认类别',
        'model_nopoint': '人工确认型号',
        'model_nonreductice': '人工确认型号',
    }
    deal_data_col = confirmed_column_by_type.get(type)
    if deal_data_col is None:
        print("不支持的类型")
        return False
    df = pd.read_excel(path)
    try:
        df['驳回原因'] = df['驳回原因'].apply(str)
    except KeyError:
        # No rejection column in the sheet: treat every row as accepted.
        df['驳回原因'] = ['nan' for _ in range(len(df))]
        print("无驳回原因数据")
    index_ = Index()
    # With a single row ``len(df) - 1`` is 0 and would divide by zero inside
    # the progress bar; fall back to a total of 1 as the original did.
    total = max(len(df) - 1, 1)
    for process_index, (_, row) in enumerate(df.iterrows()):
        print(index_(process_index, total), end='%')
        search_field_value = str(row['ID'])
        remark = row['驳回原因']
        if remark != 'nan':
            # Rejected row.
            # NOTE(review): local time is labelled "+08:00" unconditionally -
            # confirm the machine runs in that time zone.
            now_time = str(datetime.datetime.now()).replace(' ','T').split('.')[0]+"+08:00"
            es.UpdateReturnData(type,search_field_value,remark,now_time)
            continue
        # Accepted row.
        deal_data = row[deal_data_col]
        es.UpdateDealData(type,search_field_value,deal_data)
# Validate data
def _read_reviewed_sheet(path):
    """Load the reviewed Excel sheet with its '驳回原因' column as strings."""
    df = pd.read_excel(path)
    try:
        df['驳回原因'] = df['驳回原因'].apply(str)
    except KeyError:
        # No rejection column in the sheet: treat every row as accepted.
        df['驳回原因'] = ['nan' for _ in range(len(df))]
        print("无驳回原因数据")
    return df


def _check_brand_row(cursor, row):
    """Check that the confirmed brand exists in p_brand; return (passed, brand)."""
    doc_id = str(row['ID'])
    brand = str(row['人工确认品牌'])
    cursor.execute("select id from p_brand where name = %s", (brand,))
    if cursor.fetchone():
        return True, brand
    print(f"ID:'{doc_id}','{brand}'不是库内品牌，校验不通过")
    return False, brand


def _check_category_row(cursor, row):
    """Check that the confirmed category is a leaf of p_category; return (passed, category)."""
    doc_id = str(row['ID'])
    category = str(row['人工确认类别'])
    cursor.execute(
        "select id from p_category where name = %s"
        " and id not in (select DISTINCT pid from p_category)",
        (category,))
    if cursor.fetchone():
        return True, category
    print(f"ID:'{doc_id}','{category}'不是库内类别，或不是末级类别，校验不通过")
    return False, category


def _check_model_row(cursor, row):
    """Resolve the confirmed model through sku_model; return (passed, value for the sheet)."""
    doc_id = str(row['ID'])
    category = str(row['结果产品末级类名称'])
    brand = str(row['结果产品品牌名称'])
    model = str(row['人工确认型号'])
    # The alias keeps only CJK characters, ASCII letters, digits, '+' and '.',
    # upper-cased.
    model_alias = "".join(re.findall(r"[\u4e00-\u9fa5A-Za-z0-9+.]+", model)).upper()
    cursor.execute(
        "select model from sku_model where model_alias = %s"
        " and categoryname = %s and brandname = %s",
        (model_alias, category, brand))
    check_result = cursor.fetchone()
    if check_result:
        return True, check_result[0]
    print(f"ID:'{doc_id}','{model}'不是型号表内型号，校验不通过")
    return False, '型号表外型号，请添加'


def check_data(es,type,path):
    """Validate a reviewed Excel sheet against the ZI_NEW database.

    Rows with a rejection reason (column '驳回原因') are skipped.  Every
    other row is checked according to ``type``:

    * 'brand' - the confirmed brand must exist in ``p_brand``;
    * 'category' - the confirmed category must be a leaf of ``p_category``;
    * 'model_nopoint' / 'model_nonreductice' - the confirmed model must be
      found in ``sku_model`` for the row's category and brand.  When all rows
      pass, the models are replaced by the stored spelling and the sheet is
      saved next to the original with "(校验通过)" added before the extension.

    Queries are parameterised, so spreadsheet values containing quotes can
    neither break nor alter the SQL.

    :param es: unused; kept for interface compatibility
    :param type: the data type to validate (see above)
    :param path: path of the reviewed Excel file
    :return: ``(passed, path_to_import)``; ``path_to_import`` is the newly
        written file for a passing model check and ``path`` otherwise
    """
    import os  # local import: only needed to split the file extension

    is_model = type == 'model_nopoint' or type == 'model_nonreductice'
    if type == 'brand':
        check_row = _check_brand_row
    elif type == 'category':
        check_row = _check_category_row
    elif is_model:
        check_row = _check_model_row
    else:
        print("不支持的类型")
        return False,path

    # NOTE(review): database credentials are hard-coded in source; consider
    # moving them to configuration.
    conn_zi_new = pymssql.connect(host='123.56.115.207', user='zgcprice3311',password='zgcprice20200628',database= 'ZI_NEW',autocommit=True)
    try:
        cursor_zi_new = conn_zi_new.cursor()
        try:
            df = _read_reviewed_sheet(path)
            index_ = Index()
            # A single-row sheet would make the total 0 and divide by zero in
            # the progress bar; fall back to 1 as the original did.
            total = max(len(df) - 1, 1)
            res_flag = True
            res_list = []
            for process_index, (_, row) in enumerate(df.iterrows()):
                print(index_(process_index, total), end='%')
                if row['驳回原因'] != 'nan':
                    res_list.append('驳回数据')
                    continue
                passed, resolved = check_row(cursor_zi_new, row)
                res_list.append(resolved)
                if not passed:
                    res_flag = False
        finally:
            cursor_zi_new.close()
    finally:
        # Release the connection even when reading or checking fails.
        conn_zi_new.close()

    if not res_flag:
        print(f"{path},该数据校验通不过，请修改")
        return res_flag,path
    if not is_model:
        print(f"{path},该数据校验通过，可执行导入")
        return res_flag,path
    df['人工确认型号'] = res_list
    # splitext only splits the final extension, so dots elsewhere in the path
    # (directory names, version suffixes) no longer corrupt the output name.
    root, ext = os.path.splitext(path)
    pass_path = root + "(校验通过)" + (ext or ".xlsx")
    df.to_excel(pass_path)
    print(f"{pass_path},该数据校验通过，可执行导入")
    return res_flag,pass_path
# Create the ES client
host = "http://123.56.114.138:9200/"
index_name = "model_params_test"
es = ES_Client(host, index_name)

# Choose the kind of data to validate and import:
#   1. brand extraction and standardisation:                  brand
#   2. category extraction and standardisation:               category
#   3. model extraction for non-key categories:               model_nopoint
#   4. model extraction for non-restorable categories:        model_nonreductice
# e.g. type = 'brand'
type = 'model_nonreductice'

# Path of the completed workbook.  To reject rows, add a column named
# "驳回原因" and fill in the rejection reason.
path = "/Users/rico/Downloads/扫描仪型号标化_第一批(1).xlsx"

# Validate the workbook content
check_status, pass_path = check_data(es, type, path)

# Import the data - only after validation has passed!
if check_status:
    import_data(es, type, pass_path)
\ No newline at end of file
--- a/数据治理平台线下处理/线下建库/db.py
+++ b/数据治理平台线下处理/线下建库/db.py
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Sep  7 23:08:30 2020
+@author: rico
+"""
+import pymssql
+import pymysql
+import oss2
+import os
+import datetime
class MSSQL:
    """Thin wrapper around a pymssql connection to one of the known servers.

    After construction ``_conn`` holds the connection and ``_cur`` a cursor
    on it; both are ``None`` when the connection could not be established.
    """

    # Login per known host.
    # NOTE(review): credentials are hard-coded in source; consider moving
    # them to configuration or the environment.
    _CREDENTIALS = {
        '123.57.45.119': ('zgcprice', 'zgcprice20200708'),
        '123.56.115.207': ('zgcindex', 'jiayou202006'),
        '10.0.120.131': ('sa', '1qaz@WSX'),
        '10.0.120.79': ('sa', '1qaz@WSX'),
        '39.107.254.235': ('sa', '1qaz@WSX'),
    }

    def __init__(self,host,db):
        """
        :param host: server address; must be one of the known hosts
        :param db: database name
        """
        self.host = host
        self.db = db
        # Set by GetConnect and read by GetConnectInfo.
        self.user = None
        self._cur = None
        self._conn = self.GetConnect()
        if(self._conn):
            self._cur = self._conn.cursor()

    # Connect to the database
    def GetConnect(self):
        """Open the connection; return it, or ``None`` on failure (the error is printed)."""
        credentials = self._CREDENTIALS.get(self.host)
        if credentials is None:
            # Previously this surfaced as an unbound-variable error swallowed
            # by the generic handler; report it explicitly instead.
            print("连接数据库失败, %s" % ("no credentials configured for host %r" % (self.host,)))
            return None
        self.user, pwd = credentials
        try:
            return pymssql.connect(
                host=self.host,
                user=self.user,
                password=pwd,
                database =self.db,
                autocommit=True
            )
        except Exception as err:
            print("连接数据库失败, %s" % err)
            return None

    # Show connection information
    def GetConnectInfo(self):
        """Print the server, user name and database of this connection."""
        print( "连接信息：" )
        print( "服务器:%s , 用户名:%s , 数据库:%s " % (self.host,self.user,self.db))

    def Close(self):
        """Close the cursor and the connection; a no-op if connecting failed."""
        if self._cur is not None:
            self._cur.close()
        if self._conn:
            self._conn.close()
class MYSQL:
    """Thin wrapper around a pymysql connection to one of the known servers.

    After construction ``_conn`` holds the connection and ``_cur`` a cursor
    on it; both are ``None`` when the connection could not be established.
    """

    # Login per known host.
    # NOTE(review): credentials are hard-coded in source; consider moving
    # them to configuration or the environment.
    _CREDENTIALS = {
        '39.105.1.55': ('root', 'l*C#70CIAxgb6c%'),
    }

    def __init__(self,host,port,db):
        """
        :param host: server address; must be one of the known hosts
        :param port: server port
        :param db: database name
        """
        self.host = host
        self.port = port
        self.db = db
        # Set by GetConnect and read by GetConnectInfo.
        self.user = None
        self._cur = None
        self._conn = self.GetConnect()
        if(self._conn):
            self._cur = self._conn.cursor()

    # Connect to the database
    def GetConnect(self):
        """Open the connection; return it, or ``None`` on failure (the error is printed)."""
        credentials = self._CREDENTIALS.get(self.host)
        if credentials is None:
            # Previously this surfaced as an unbound-variable error swallowed
            # by the generic handler; report it explicitly instead.
            print("连接数据库失败, %s" % ("no credentials configured for host %r" % (self.host,)))
            return None
        self.user, pwd = credentials
        try:
            return pymysql.connect(
                host=self.host,
                port=self.port,
                user=self.user,
                password=pwd,
                database =self.db,
                charset="utf8",
                autocommit=True
            )
        except Exception as err:
            print("连接数据库失败, %s" % err)
            return None

    # Show connection information
    def GetConnectInfo(self):
        """Print the server, user name and database of this connection."""
        print( "连接信息：" )
        print( "服务器:%s , 用户名:%s , 数据库:%s " % (self.host,self.user,self.db))

    def Close(self):
        """Close the cursor and the connection; a no-op if connecting failed."""
        if self._cur is not None:
            self._cur.close()
        if self._conn:
            self._conn.close()
+'''
+ms = MSSQL('123.56.115.207','zdindex')
+conn = ms._conn
+cursor = ms._cur
+cursor.execute(f"select top 10 *  from zd_week_price")
+cursor.fetchall()
+ms.Close()
+cursor.close()
+conn.close()
+'''
class OSS(object):
    """Small helper around an oss2 bucket supporting upload and download."""

    def __init__(self, accessKey_id, accessKey_secret, endpoint, bucket_name):
        self.auth = oss2.Auth(accessKey_id, accessKey_secret)
        self.bucket = oss2.Bucket(self.auth, endpoint, bucket_name)

    def download_from_oss(self, oss_folder_prefix, object_name, local_save_path):
        """Download one object, keeping its path below ``oss_folder_prefix``.

        :param oss_folder_prefix: OSS folder whose sub-structure is mirrored locally
        :param object_name: full key of the object to download
        :param local_save_path: local directory that receives the file
        """
        # Part of the key after the folder prefix; OSS keys use '/' separators.
        oss_path_prefix = object_name.split(oss_folder_prefix)[-1]
        # Convert to the local separator (needed on Windows).
        oss_path_prefix = os.sep.join(oss_path_prefix.strip('/').split('/'))
        local_file_path = os.path.join(local_save_path, oss_path_prefix)
        # Create the target directory if needed; exist_ok avoids the
        # check-then-create race of a separate os.path.exists test.
        local_file_prefix = os.path.dirname(local_file_path)
        if local_file_prefix:
            os.makedirs(local_file_prefix, exist_ok=True)
        self.bucket.get_object_to_file(object_name, local_file_path)

    def upload_to_oss(self, prefix, suffix, local_upload_path):
        """Upload every file ending in ``suffix`` found below ``local_upload_path``.

        The files are stored under ``<prefix>/<YYYYmmddHHMMSS>/`` with their
        relative directory structure preserved.

        :param prefix: OSS key prefix to upload under
        :param suffix: only files whose name ends with this are uploaded
        :param local_upload_path: local directory (or tree) to upload
        """
        # The current date-time names the folder of this upload.
        folder_name = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
        oss_upload_prefix = prefix.rstrip('/') + '/' + folder_name
        # Drop a trailing separator the caller may have passed; done once,
        # since it does not change between iterations.
        local_root = local_upload_path.rstrip(os.sep) or local_upload_path
        for root, dirs, files in os.walk(local_root):
            for file in files:
                if not file.endswith(suffix):
                    continue
                file_path = os.path.join(root, file)
                # relpath keeps the layout below the upload directory and is
                # correct even if the directory name repeats inside the path.
                relative_file_path = os.path.relpath(file_path, local_root)
                # OSS keys always use '/', whatever the local separator is.
                oss_relative_path = relative_file_path.replace(os.sep, '/')
                oss_upload_path = oss_upload_prefix + '/' + oss_relative_path
                self.bucket.put_object_from_file(oss_upload_path, file_path)

    def travel_download(self, prefix, suffix, local_save_path):
        """
        Download the files of the most recent upload folder below ``prefix``.

        :param prefix: OSS directory prefix whose sub-folders are listed
        :param suffix: file suffix, e.g. ``.csv``; only matching objects are downloaded
        :param local_save_path: local directory that receives the files
        :return: None
        :raises ValueError: if there is no sub-folder below ``prefix``
        """
        # Drop a trailing separator the caller may have passed.
        local_save_path = local_save_path.rstrip(os.sep)
        # With a delimiter the listing returns the direct sub-folders as
        # prefix entries; plain files at this level are ignored.
        top_level_folder = [
            obj.key
            for obj in oss2.ObjectIterator(self.bucket, prefix=prefix, delimiter='/')
            if obj.is_prefix()
        ]
        if not top_level_folder:
            raise ValueError("no sub-folder found under OSS prefix %r" % (prefix,))
        # Upload folders are named by timestamp, so the greatest name is the
        # most recent upload.
        target_folder = max(top_level_folder)
        for obj in oss2.ObjectIterator(self.bucket, prefix=target_folder):
            # Keys such as 'xxx/xxx/' can also be listed as objects, so the
            # suffix test is what selects real files.
            if not obj.is_prefix() and obj.key.endswith(suffix):
                self.download_from_oss(target_folder, obj.key, local_save_path)
\ No newline at end of file
--- a/数据治理平台线下处理/线下建库/stockInfo.py
+++ b/数据治理平台线下处理/线下建库/stockInfo.py
--- a/数据治理平台线下处理/线下建库/zgcindex_data2DB.py
+++ b/数据治理平台线下处理/线下建库/zgcindex_data2DB.py
--- a/数据治理平台线下处理/线下建库/zgcindex_sync_excel2ob.py
+++ b/数据治理平台线下处理/线下建库/zgcindex_sync_excel2ob.py
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Nov 22 22:38:22 2021
+@author: rico
+"""
+from db import MYSQL,MSSQL,OSS
+import pandas as pd
# Source workbook; FINISH_P_SKU is read as text.
path = "/Users/rico/WorkSpace/1_Project/Company/中电中采/TEMP Workspace/ES处理相关/ES线下补参处理/（已确认）投影机结果202111241215280008.xlsx"
df = pd.read_excel(path,converters = {'FINISH_P_SKU':str})
# Columns exported to the DW layer.  An earlier, wider selection (with
# FINISH_P_MODEL, FINISH_P_SKU and FINISH_P_STATUS) was immediately
# overwritten by this one and has been dropped as dead code.
export_df = df[['DOC_ID','DATA_BATCH','DATA_END_TIME','FLOW_NODE_STATUS','SOURCE_P_SKU','SOURCE_CHANNEL_NAME','SOURCE_CHANNEL_SNAME','SOURCE_P_NAME',
                'SOURCE_P_LASTCATEGORY_NAME','SOURCE_P_BRAND_NAME','SOURCE_P_PRICE','SOURCE_P_URL','FINISH_P_REMARK','FINISH_P_BRAND_NAME','FINISH_P_BRAND_CODE','FINISH_P_LASTCATEGORY_NAME',
                'FINISH_P_LASTCATEGORY_CODE','FINISH_P_PARAMS','FINISH_P_NAME',
                'FINISH_P_CHECK_STATUS']]
export_df = export_df.fillna('无')
mysql = MYSQL('39.105.1.55',2883,'ZD_PUBLIC_pro')
cursor = mysql._cur
try:
    # Sync the data to the DW layer
    tablename = "DW_PRODUCT_ALL_RES"
    cols = ','.join(export_df.columns)
    val = (tuple(i) for i in export_df.values)
    sqlstr = "INSERT INTO {} ({}) VALUES ({})".format(tablename,cols,','.join(['%s']*len(export_df.columns)))
    try:
        cursor.executemany(sqlstr, val)
        print('>>> 插入数据成功，表 {} 共插入 {} 行数据'.format(tablename,len(export_df)))
    except Exception as e:
        print('>>> 插入数据失败', e)
    # Patch the category and brand names from a second workbook.
    df= pd.read_excel("/Users/rico/WorkSpace/1_Project/Company/中电中采/TEMP Workspace/ES处理相关/ES线下补参处理/安徽三大类结果1119(1).xlsx")
    update_sql = ("update DW_PRODUCT_ALL_RES set SOURCE_P_LASTCATEGORY_NAME = %s,"
                  " SOURCE_P_BRAND_NAME = %s where DOC_ID = %s")
    for index,row in df.iterrows():
        doc_id = row['DOC_ID']
        name = row['SOURCE_P_LASTCATEGORY_NAME']
        brand = row['SOURCE_P_BRAND_NAME']
        # Parameterised so that quotes in the spreadsheet values cannot break
        # or alter the statement; str() keeps the original text formatting.
        cursor.execute(update_sql, (str(name), str(brand), str(doc_id)))
finally:
    # Release the cursor and connection even when a statement fails.
    mysql.Close()
\ No newline at end of file