第一次上传代码到gitlab！

22790607 · Zhouxingyu · 22790607 · 22790607 · 22790607 · 22790607
Commit 22790607 authored Aug 06, 2019 by Zhouxingyu
6 changed files
--- a/__pycache__/data_add_class.cpython-35.pyc
+++ b/__pycache__/data_add_class.cpython-35.pyc
--- a/__pycache__/functions.cpython-35.pyc
+++ b/__pycache__/functions.cpython-35.pyc
--- a/data_add_class.py
+++ b/data_add_class.py
+import xlrd
+import pandas as pd
+import numpy as np
+from functions import *
+class Data_add():
+    def __init__(self,table_path,out_path,pcode_col,brand_col,price_col,type_col,ser_col,data_col):
+        '''
+        依次输入表格路径，输出路径，productcode，品牌，价格，型号，系列对应的列号,以及一个原始数据或者不需要预测数据的列号。
+        '''
+        self._table = data_load(table_path).reset_index(drop = True)
+        self._path_out = out_path
+        self._data_col = data_col
+        self._pcode_col = pcode_col
+        self._brand_col = brand_col
+        self._price_col = price_col
+        self._type_col = type_col
+        self._ser_col = ser_col
+    def data_add_main(self):
+        print('数据装填完毕！')
+        some_lyst=[self._data_col,self._pcode_col,self._price_col,self._brand_col,self._type_col]
+        n = self._table.shape[1]   #列
+        lyst=[]
+        for i in range(n):
+            if in_or_out(some_lyst,i):
+                lyst.append(i)
+                continue
+            else:
+                lyst_null=moudle11(self._table,i,self._pcode_col)
+                for j in range(len(lyst_null)):
+                    table=data_add(self._table,lyst_null[j],i,self._pcode_col,self._brand_col,self._price_col,self._type_col,self._ser_col)
+        print("填写空缺值完毕！")
+        pre(table,self._path_out)
\ No newline at end of file
--- a/functions.py
+++ b/functions.py
+import difflib
+import re
+import pandas as pd
+import xlrd
+def str_split(string):
+    '''
+    字符分割。
+    '''
+    pattern = re.compile('.{1}')
+    string=str(' '.join(pattern.findall(string)))
+    return string.split()
+def is_alphabet(uchar):
+    '''
+    判断一个unicode是否是英文字母
+    '''
+    if (uchar >= u'\u0041' and uchar <= u'\u005a') or (uchar >= u'\u0061' and uchar <= u'\u007a'):
+        return True
+    else:
+        return False
+def string_similar(s1, s2):
+    '''
+    字符串相似度计算，此处算法有待改进，有更好的算法可以直接替代此处。
+    '''
+    return difflib.SequenceMatcher(None, s1, s2).quick_ratio()
+def pcode2Line_num(table,pcode,col=2):
+    '''
+    productcode转行号，请根据pcode的实际列数设置col的数值。
+    '''
+    data_col = list(table.iloc[:, col])
+    n= data_col.index(pcode)
+    return n
+def moudle8(table,cand_pcode,pre_pcode,col):
+    '''
+    此模块为字符匹配，table为原始表格，cand_pcode为候选的产品productcode，pre_pcode为空缺待预测的产品prodectcode，为单一的字符串。
+    '''
+    p=string_similar(table.iloc[pcode2Line_num(table,cand_pcode),col],table.iloc[pcode2Line_num(table,pre_pcode),col])
+    return p
+def moudle9(table,mat_pcode,pre_pcode,col):
+    '''
+    此模块为预测数据填写，利用产品的mat_productcode匹配数据填写入待预测数据pre_pcode的相应列col中。
+    '''
+    #if mat_pcode !=pre_pcode:
+        #print("已经把表格中 [",pcode2Line_num(table,pre_pcode),',',col,"] 空缺的参数替换为 [",pcode2Line_num(table,mat_pcode),",",col,"] 的参数。")
+    table.iloc[pcode2Line_num(table,pre_pcode),col]=table.iloc[pcode2Line_num(table,mat_pcode),col]
+    return table
+def moudle10(table,pre_pcode,ser_col=15,type_col=9):
+    '''
+    ser_col为系列号，type_col为型号。函数将型号按照规律赋给系列号。
+    '''
+    str2pre=str_split(str(table.iloc[pcode2Line_num(table,pre_pcode),type_col]))
+    str2out=[]
+    if is_alphabet(str2pre[0])==False:
+        if len(str2pre)<3:
+            for i in range(len(str2pre)):
+                str2out.append(str2pre[i])
+        else:
+            for i in range(3):
+                str2out.append(str2pre[i])
+    else:       
+        for i in range(len(str2pre)):
+            if is_alphabet(str2pre[i]):
+                str2out.append(str2pre[i])
+            else: break
+    str2out=''.join(str2out)
+    table.iloc[pcode2Line_num(table,pre_pcode),ser_col]=str2out
+    return table
+def moudle7(table,productcode,col):
+    '''
+    提取相应productcode的对应列数据。
+    '''
+    return table.iloc[col,pcode2Line_num(table,productcode)]
+def moudle6(old_table,table,pre_pcode,type_col,pcode_col):
+    '''
+    old_code是原始表格，table是筛选后表格，pre_pcode是数据缺失值的pcode，type_col是型号的列号，输出为匹配度最高的pcode。
+    '''
+    pcode_col_data = list(table.iloc[:, pcode_col])
+    p_max=moudle8(old_table,pcode_col_data[0],pre_pcode,type_col)
+    pcode_max=pcode_col_data[0]
+    for i in range(1,len(pcode_col_data)):
+        p=moudle8(old_table,pcode_col_data[i],pre_pcode,type_col)
+        if p>p_max:
+            pcode_max=pcode_col_data[i]
+            p_max=p
+    return pcode_max
+def moudle5(old_table,table,p_range,pre_pcode,price_col):
+    '''
+    table为待筛选表格，range为0-1之间的浮动范围，pre_pcode为待填写的号，price_col为价格的列号。返回去除了不在区间内产品的表格。
+    '''
+    price_col_data = list(table.iloc[:, price_col])           #选取的价格列表
+    price_col_data = list(map(lambda x:float(x), price_col_data))          #字符串转化为浮点型
+    price_line=int(old_table.iloc[pcode2Line_num(old_table,pre_pcode),price_col])
+    n=[]
+    for i in range(len(price_col_data)):
+        if price_col_data[i] > price_line*(1+p_range) or price_col_data[i] < price_line*(1-p_range):
+            n.append(i)
+    if len(n) == len(price_col_data):
+        D_value_min = abs(price_col_data[0]-price_line)
+        min_num=0
+        for j in range(len(price_col_data)):
+            D_value = abs(price_col_data[j]-price_line)
+            if D_value < D_value_min:
+                min_num = j
+                D_value_min = D_value
+        n.pop(min_num)
+    new_table=table.drop(n)
+    return new_table
+def moudle5_old(old_table,table,p_range,pre_pcode,price_col):
+    '''
+    table为待筛选表格，range为0-1之间的浮动范围，pre_pcode为待填写的号，price_col为价格的列号。返回去除了不在区间内产品的表格。
+    '''
+    price_col_data = list(table.iloc[:, price_col])
+    price_col_data = list(map(lambda x:float(x), price_col_data))          #字符串转化为浮点型
+    price_line=int(old_table.iloc[pcode2Line_num(old_table,pre_pcode),price_col])
+    n=[]
+    for i in range(len(price_col_data)):
+        if price_col_data[i] > price_line*(1+p_range) or price_col_data[i] < price_line*(1-p_range):
+            n.append(i)
+    new_table=table.drop(n)
+    return new_table
+def moudle4(table,row,col):
+    '''
+    数据是否为空判断。
+    '''
+    data=table.iloc[row,col]    #把row行col列的数据提取出来。如果为空或无，则输出‘1’。
+    if data=='NULL' or data=='nan' or data=='NAN' or data=='null' or data=='NA' or data=='':
+        return True
+    else: return False
+def moudle3(old_table,table,pre_pcode,price_col,p_range=0.5):
+    '''
+    通过表格和待预测数据pcode与价格列号，通过递归实现寻找一定行数的价格近似产品价格区间。
+    '''
+    if p_range == 0:
+        print("匹配数据太多！返回range=1")
+        return p_range
+    if p_range == 2:
+        print("匹配数据太少！range=2时也没有匹配数据！")
+        return p_range
+    price_line=int(old_table.iloc[pcode2Line_num(old_table,pre_pcode),price_col])
+    price_col_data = list(table.iloc[:, price_col])####???
+    price_col_data = list(map(lambda x:float(x), price_col_data))
+    m=0
+    n=len(price_col_data)
+    for j in range(len(price_col_data)):
+         if price_col_data[j] <= price_line*(1+p_range) and price_col_data[j] >= price_line*(1-p_range):
+            m+=1
+    if m > 10 or m == n:
+         p_range-=0.05
+         moudle3(old_table,table,pre_pcode,price_col,p_range)
+    if m < n//3 or m == 0:
+         p_range+=0.05
+         moudle3(old_table,table,pre_pcode,price_col,p_range)
+    return p_range
+def moudle2(old_table,table,pre_pcode,brand_col):
+    '''
+    oldtable为原始表格，table是筛选后表格，pre_pcode为待填写数据pcode，brand_col为品牌的列号。
+    '''
+    n=pcode2Line_num(old_table,pre_pcode)
+    pre_pcode_brand=old_table.iloc[n,brand_col]
+    brand_lyst=list(table.iloc[:, brand_col])
+    lyst=[]
+    for i in range ( len(brand_lyst)):
+        if brand_lyst[i] != pre_pcode_brand:
+            lyst.append(i)
+    if len(lyst)==0:
+        print("表格中没有该品牌产品！返回原表格。")
+        return table
+    new_table=table.drop(lyst)
+    return new_table
+def moudle1(table,col):
+    '''
+    输入表格和列号，返回列中元素不为无或null的table。
+    '''
+    all_lyst=list(table.iloc[:, col])
+    lyst=[]
+    for i in range(len(all_lyst)):
+        if all_lyst[i] == 'NULL' or all_lyst[i] == 'nan' or all_lyst[i]=='NAN' or all_lyst[i]=='null' or all_lyst[i]=='NA' or all_lyst[i]=='':
+            lyst.append(i)
+    new_table=table.drop(lyst)
+    return new_table
+def moudle11(table,col,pcode_col):
+    '''
+    返回指定col列为空或者null的pcode。
+    '''
+    all_lyst=list(table.iloc[:, col])
+    pcode_lyst=list(table.iloc[:, pcode_col])
+    lyst=[]
+    for i in range(len(all_lyst)):
+        if all_lyst[i] == 'NULL' or all_lyst[i] == 'nan' or all_lyst[i]=='NAN' or all_lyst[i]=='null' or all_lyst[i]=='NA' or all_lyst[i]=='':
+            lyst.append(pcode_lyst[i])
+    return lyst
+def data_load(path):
+    data = xlrd.open_workbook(path)
+    table = data.sheets()[0]
+    ncols = table.ncols
+    data2 = pd.DataFrame([])
+    for i in range(ncols):
+        data2[i] = table.col_values(i)
+    data2.rename(columns=data2.iloc[0, :], inplace=True)
+    data2.drop([0], axis=0, inplace=True)
+    return data2
+def in_or_out(lyst,n):
+    for i in range(len(lyst)):
+        if n == lyst[i]:
+            return True
+    else: return False
+def pre(matrix_text, addr):
+    """将数据导出
+    matrix_text：导出数据
+    addr：导出地址
+    """
+    matrix_text.to_csv(addr, sep=',', index=0, encoding='utf_8_sig', columns=matrix_text.columns)
+def xlsx_to_csv_pd(csv_path,xlsx_path):
+    data_xls = pd.read_excel(csv_path, index_col=0)
+    data_xls.to_csv(xlsx_path, encoding='utf-8')
+def moudle12(old_table,pre_pcode,price_col,pcode_col):
+    '''
+    在整个品牌都缺失某项参数，将价格最相近的产品的参数赋予这个空值。
+    '''
+    price_col_data = list(old_table.iloc[:, price_col])
+    price_col_data = list(map(lambda x:float(x), price_col_data))          #字符串转化为浮点型
+    pcode_col_data = list(old_table.iloc[:, pcode_col])
+    price_line=int(old_table.iloc[pcode2Line_num(old_table,pre_pcode),price_col])
+    max_row=0
+    min_sub=abs(price_col_data[max_row]-price_line)
+    for i in range(1,len(price_col_data)):
+        sub = abs(price_col_data[i]-price_line)
+        if sub < min_sub:
+            min_sub = sub
+            max_row = i
+    fit_pcode = pcode_col_data[max_row]
+    return fit_pcode
+def na(data_matrix, h):
+    """随机赋空h个值"""
+    import numpy as np
+    data_nan = data_matrix.copy()
+    xx = np.random.randint(data_matrix.shape[0], size=h)
+    yy = np.random.randint(data_matrix.shape[1], size=h)
+    for i in range(h):
+        data_nan.iloc[xx[i], yy[i]] = 0
+    return data_nan, xx, yy
+def if_none(table,):
+    lyst=list(table.iloc[:, 0])
+    if len(lyst) == 0:
+        return True
+def data_add(table,pcode,col,pcode_col,brand_col,price_col,type_col,ser_col):
+    table_new=table
+    table_new=moudle2(table,table_new,pcode,brand_col)                           #原始表格序号从0开始排。
+    if if_none(table_new):
+        return table
+    table_new = table_new.reset_index(drop=True)           #reset后序号从0开始排。
+    table_new=moudle1(table_new,col)
+    if if_none(table_new):
+        '''
+        此处添加价格判断函数，寻找所有品牌价格最相近行。
+        '''
+        if col == ser_col:
+            table=moudle10(table,pcode,ser_col,type_col)
+        else:
+            match_max_pcode = moudle12(table,pcode,price_col,pcode_col)
+            table=moudle9(table,match_max_pcode,pcode,col)
+        return table
+    table_new = table_new.reset_index(drop=True)           #reset
+    p=0.5
+    table_new=moudle5(table,table_new,p,pcode,price_col)
+    if if_none(table_new):
+        return table
+    table_new = table_new.reset_index(drop=True)               #reset
+    match_max_pcode=moudle6(table,table_new,pcode,type_col,pcode_col)
+    table=moudle9(table,match_max_pcode,pcode,col)
+    return table
\ No newline at end of file
--- a/lib_nonstand-stand_price.xlsx
+++ b/lib_nonstand-stand_price.xlsx
--- a/main.py
+++ b/main.py
+from data_add_class import Data_add
+def main():
+    '''
+    table_path = input("请输入excel文件地址：")
+    out_path = input("请输入excel文件输出地址：")
+    pcode_col = input("请输出 productcode 在第几列：")
+    price_col = input("请输入价格在第几列：")
+    brand_col = input("请输入品牌在第几列：")
+    type_col = input("请输入型号在第几列：")
+    ser_col = input("请输入系列号在第几列：")
+    data_col = input("请输入一列自订的不需要填补列的列号:")
+    '''
+    table_path = ("lib_nonstand-stand_price.xlsx")
+    out_path = ("lib_nonstand-stand_new.csv")
+    data_col=int('3')   #输入文本信息，可以为网页url，也可以为其他不需要预测的信息列。
+    pcode_col=int("2")  #输入productcode列号或者sku列号。
+    price_col=int("47")  #输入价格所在列号。
+    brand_col=int("4")  #输入品牌所在列号
+    type_col=int("38")   #输入型号所在列号
+    ser_col=int("44")   #输入系列所在列号
+    a = Data_add(table_path,out_path,pcode_col,brand_col,price_col,type_col,ser_col,data_col)
+    a.data_add_main()
+if __name__ == "__main__":
+    main()
\ No newline at end of file