Commit 22790607 authored by Zhouxingyu's avatar Zhouxingyu

第一次上传代码到gitlab!

parents
Pipeline #4 canceled with stages
import xlrd
import pandas as pd
import numpy as np
from functions import *
class Data_add():
    '''
    Fill the missing cells of a product spreadsheet and export the result.

    The workbook is loaded once in the constructor. ``data_add_main`` then
    visits every column that is not explicitly excluded, asks ``moudle11``
    which product codes have a missing value there, and fills each of them
    through ``data_add``.
    '''

    def __init__(self, table_path, out_path, pcode_col, brand_col, price_col, type_col, ser_col, data_col):
        '''
        Load the table and remember the column layout.

        Args:
            table_path: Path of the input workbook (handed to ``data_load``).
            out_path: Path the filled table is written to (handed to ``pre``).
            pcode_col: Column number of the product code.
            brand_col: Column number of the brand.
            price_col: Column number of the price.
            type_col: Column number of the model.
            ser_col: Column number of the series.
            data_col: Column number of a raw-data column that must not be
                predicted.
        '''
        # reset_index(drop=True) renumbers the rows 0..n-1.
        self._table = data_load(table_path).reset_index(drop=True)
        self._path_out = out_path
        self._data_col = data_col
        self._pcode_col = pcode_col
        self._brand_col = brand_col
        self._price_col = price_col
        self._type_col = type_col
        self._ser_col = ser_col

    def data_add_main(self):
        '''
        Fill every missing cell in the non-excluded columns, then export.

        The data, product-code, price, brand and model columns are skipped.
        The series column is not in that list and is therefore processed.
        '''
        print('数据装填完毕!')
        skipped_cols = (
            self._data_col,
            self._pcode_col,
            self._price_col,
            self._brand_col,
            self._type_col,
        )
        for col in range(self._table.shape[1]):
            if col in skipped_cols:
                continue
            for pcode in moudle11(self._table, col, self._pcode_col):
                # Keep the result on the instance so the export below always
                # has a table, even when no cell needed filling.
                self._table = data_add(
                    self._table,
                    pcode,
                    col,
                    self._pcode_col,
                    self._brand_col,
                    self._price_col,
                    self._type_col,
                    self._ser_col,
                )
        print("填写空缺值完毕!")
        pre(self._table, self._path_out)
\ No newline at end of file
import difflib
import re
import pandas as pd
import xlrd
def str_split(string):
    '''
    Split a string into a list of its individual non-whitespace characters.

    Whitespace characters (spaces, tabs, line breaks, ...) are dropped, so an
    empty or all-whitespace string yields an empty list.
    '''
    return [char for char in string if not char.isspace()]
def is_alphabet(uchar):
    '''
    Return True when ``uchar`` is an ASCII letter (A-Z or a-z), else False.
    '''
    is_upper = 'A' <= uchar <= 'Z'
    is_lower = 'a' <= uchar <= 'z'
    return is_upper or is_lower
def string_similar(s1, s2):
    '''
    Return a similarity score in [0, 1] for two sequences.

    The score is ``difflib.SequenceMatcher.quick_ratio()``, which is an upper
    bound on the real matching ratio. The original author marked this
    algorithm as a placeholder that a better one may replace.
    '''
    matcher = difflib.SequenceMatcher(None, s1, s2)
    return matcher.quick_ratio()
def pcode2Line_num(table, pcode, col=2):
    '''
    Convert a product code into its positional row number.

    Args:
        table: DataFrame to search.
        pcode: Product code to look for.
        col: Positional column holding the product codes; set it to the
            actual product-code column of the table.

    Returns:
        Position of the first row whose value in ``col`` equals ``pcode``.

    Raises:
        ValueError: If ``pcode`` does not occur in the column.
    '''
    return list(table.iloc[:, col]).index(pcode)
def moudle8(table, cand_pcode, pre_pcode, col):
    '''
    Character-matching module: score how alike two products are in ``col``.

    Args:
        table: The original table.
        cand_pcode: Product code of the candidate product.
        pre_pcode: Product code of the product whose value is missing.
        col: Positional column whose values are compared.

    Returns:
        The ``string_similar`` score of the two cell values.

    Both rows are located with ``pcode2Line_num`` using its default
    product-code column.
    '''
    cand_value = table.iloc[pcode2Line_num(table, cand_pcode), col]
    pre_value = table.iloc[pcode2Line_num(table, pre_pcode), col]
    # Numeric cells are not sequences and would make the matcher raise a
    # TypeError, so compare the text form (moudle10 stringifies the same way).
    return string_similar(str(cand_value), str(pre_value))
def moudle9(table, mat_pcode, pre_pcode, col):
    '''
    Prediction write-back: copy a value from the matched product.

    The cell in column ``col`` of the row for ``mat_pcode`` is copied into the
    same column of the row for ``pre_pcode``. ``table`` is modified in place
    and also returned.
    '''
    source_row = pcode2Line_num(table, mat_pcode)
    target_row = pcode2Line_num(table, pre_pcode)
    table.iloc[target_row, col] = table.iloc[source_row, col]
    return table
def moudle10(table, pre_pcode, ser_col=15, type_col=9):
    '''
    Derive the series of a product from its model and store it.

    Args:
        table: Table to update in place.
        pre_pcode: Product code of the row to fill.
        ser_col: Positional column of the series.
        type_col: Positional column of the model.

    Returns:
        The same ``table`` object.

    Rule: if the model starts with an ASCII letter, the series is its leading
    run of letters. Otherwise the series is the first three characters, or
    all of them when the model is shorter. Whitespace in the model is ignored
    because ``str_split`` drops it.
    '''
    row = pcode2Line_num(table, pre_pcode)
    chars = str_split(str(table.iloc[row, type_col]))
    if not chars:
        # An empty model gives nothing to derive a series from; leave the
        # cell untouched instead of failing on chars[0].
        return table

    if is_alphabet(chars[0]):
        prefix = []
        for char in chars:
            if not is_alphabet(char):
                break
            prefix.append(char)
    else:
        prefix = chars[:3]

    table.iloc[row, ser_col] = ''.join(prefix)
    return table
def moudle7(table, productcode, col):
    '''
    Return the value in column ``col`` for the given product code.

    The row is located with ``pcode2Line_num`` using its default product-code
    column. ``iloc`` takes (row, column) in that order; the earlier version
    passed them swapped and so read the wrong cell.
    '''
    return table.iloc[pcode2Line_num(table, productcode), col]
def moudle6(old_table, table, pre_pcode, type_col, pcode_col):
    '''
    Pick the candidate whose model is most similar to the target's model.

    Args:
        old_table: The original table; similarity scores are computed on it.
        table: The filtered table that supplies the candidate product codes.
        pre_pcode: Product code of the row with the missing value.
        type_col: Positional column of the model.
        pcode_col: Positional column of the product code in ``table``.

    Returns:
        The candidate product code with the highest ``moudle8`` score; on a
        tie the earliest candidate wins.

    Raises:
        IndexError: If ``table`` has no rows.
    '''
    candidates = list(table.iloc[:, pcode_col])
    best_pcode = candidates[0]
    best_score = moudle8(old_table, best_pcode, pre_pcode, type_col)
    for candidate in candidates[1:]:
        score = moudle8(old_table, candidate, pre_pcode, type_col)
        if score > best_score:
            best_pcode, best_score = candidate, score
    return best_pcode
def moudle5(old_table, table, p_range, pre_pcode, price_col):
    '''
    Keep only the rows whose price is close to the target product's price.

    Args:
        old_table: The original table; the target price is read from it.
        table: The table to filter.
        p_range: Allowed relative deviation, between 0 and 1.
        pre_pcode: Product code of the row being filled.
        price_col: Positional column of the price.

    Returns:
        A new DataFrame with the rows of ``table`` whose price lies within
        ``target * (1 - p_range)`` and ``target * (1 + p_range)``. If no row
        qualifies, the single row with the closest price is kept instead. An
        empty ``table`` yields an empty result.
    '''
    prices = [float(value) for value in table.iloc[:, price_col]]
    # Read the target as float, like the candidates, so fractional prices are
    # not truncated and text such as "12.5" is accepted.
    target = float(old_table.iloc[pcode2Line_num(old_table, pre_pcode), price_col])
    upper = target * (1 + p_range)
    lower = target * (1 - p_range)

    keep = [
        position
        for position, price in enumerate(prices)
        if not (price > upper or price < lower)
    ]
    if not keep and prices:
        # Nothing in range: fall back to the closest price (first on ties).
        keep = [min(range(len(prices)), key=lambda i: abs(prices[i] - target))]

    # Select by position so the result does not depend on the row labels
    # being 0..n-1.
    return table.iloc[keep]
def moudle5_old(old_table, table, p_range, pre_pcode, price_col):
    '''
    Older price filter without the closest-price fallback.

    Args:
        old_table: The original table; the target price is read from it.
        table: The table to filter.
        p_range: Allowed relative deviation, between 0 and 1.
        pre_pcode: Product code of the row being filled.
        price_col: Positional column of the price.

    Returns:
        A new DataFrame with the rows of ``table`` whose price lies within
        ``target * (1 - p_range)`` and ``target * (1 + p_range)``; it may be
        empty.
    '''
    prices = [float(value) for value in table.iloc[:, price_col]]
    # Read the target as float, like the candidates, so fractional prices are
    # not truncated and text such as "12.5" is accepted.
    target = float(old_table.iloc[pcode2Line_num(old_table, pre_pcode), price_col])
    upper = target * (1 + p_range)
    lower = target * (1 - p_range)

    keep = [
        position
        for position, price in enumerate(prices)
        if not (price > upper or price < lower)
    ]
    # Select by position so the result does not depend on the row labels
    # being 0..n-1.
    return table.iloc[keep]
def moudle4(table, row, col):
    '''
    Tell whether the cell at (row, col) holds a missing-value marker.

    Returns True when the cell equals one of 'NULL', 'nan', 'NAN', 'null',
    'NA' or the empty string, otherwise False.
    '''
    missing_markers = ('NULL', 'nan', 'NAN', 'null', 'NA', '')
    return table.iloc[row, col] in missing_markers
def moudle3(old_table,table,pre_pcode,price_col,p_range=0.5):
    '''
    Recursively search for a relative price range around the target price.

    old_table is the original table (the target price is read from it),
    table is the table whose prices are counted, pre_pcode is the product
    code being filled, price_col is the price column and p_range is the
    starting relative deviation.

    NOTE(review): the results of the recursive calls below are discarded, so
    the value returned to the caller is the incoming p_range adjusted by the
    steps taken in this call only. Confirm whether `return moudle3(...)` was
    intended.
    NOTE(review): the stop conditions compare floats for exact equality
    after repeated 0.05 steps; they may never be hit exactly, in which case
    the recursion would not stop on its own. Confirm.
    '''
    if p_range == 0:
        # NOTE(review): the message mentions range=1, but the value returned
        # in this branch is p_range, i.e. 0.
        print("匹配数据太多!返回range=1")
        return p_range
    if p_range == 2:
        print("匹配数据太少!range=2时也没有匹配数据!")
        return p_range
    price_line=int(old_table.iloc[pcode2Line_num(old_table,pre_pcode),price_col])
    price_col_data = list(table.iloc[:, price_col])  # original author left this line marked as questionable
    price_col_data = list(map(lambda x:float(x), price_col_data))
    m=0  # number of rows whose price lies inside the current range
    n=len(price_col_data)
    for j in range(len(price_col_data)):
        if price_col_data[j] <= price_line*(1+p_range) and price_col_data[j] >= price_line*(1-p_range):
            m+=1
    # More than 10 matches, or every row matches: narrow the range.
    if m > 10 or m == n:
        p_range-=0.05
        moudle3(old_table,table,pre_pcode,price_col,p_range)
    # Fewer than a third of the rows match, or none: widen the range.
    # NOTE(review): both conditions can hold in the same call (for example
    # m > 10 and m < n//3), which narrows and then widens again.
    if m < n//3 or m == 0:
        p_range+=0.05
        moudle3(old_table,table,pre_pcode,price_col,p_range)
    return p_range
def moudle2(old_table, table, pre_pcode, brand_col):
    '''
    Keep only the rows of ``table`` that share the target product's brand.

    Args:
        old_table: The original table; the target brand is read from it.
        table: The table to filter.
        pre_pcode: Product code of the row being filled.
        brand_col: Positional column of the brand.

    Returns:
        ``table`` itself when no row would be removed, otherwise a new
        DataFrame holding only the rows with the target brand (possibly
        empty).
    '''
    target_brand = old_table.iloc[pcode2Line_num(old_table, pre_pcode), brand_col]
    brands = list(table.iloc[:, brand_col])
    keep = [position for position, brand in enumerate(brands) if brand == target_brand]

    if len(keep) == len(brands):
        # NOTE(review): this branch runs when NO row differs from the target
        # brand, yet the message says the table has no product of that
        # brand. Behaviour is kept as is; confirm which one was intended.
        print("表格中没有该品牌产品!返回原表格。")
        return table

    # Select by position so the result does not depend on the row labels
    # being 0..n-1.
    return table.iloc[keep]
def moudle1(table, col):
    '''
    Return the rows of ``table`` whose value in column ``col`` is present.

    A value counts as missing when it equals one of 'NULL', 'nan', 'NAN',
    'null', 'NA' or the empty string.

    Args:
        table: The table to filter.
        col: Positional column to inspect.

    Returns:
        A new DataFrame with the rows that have a value; their row labels
        are preserved.
    '''
    missing_markers = ('NULL', 'nan', 'NAN', 'null', 'NA', '')
    keep = [
        position
        for position, value in enumerate(table.iloc[:, col])
        if value not in missing_markers
    ]
    # Select by position: dropping positional numbers as labels fails or
    # removes the wrong rows whenever the row labels are not 0..n-1.
    return table.iloc[keep]
def moudle11(table, col, pcode_col):
    '''
    List the product codes of the rows whose value in ``col`` is missing.

    A value counts as missing when it equals one of 'NULL', 'nan', 'NAN',
    'null', 'NA' or the empty string. The codes are returned in row order.
    '''
    missing_markers = ('NULL', 'nan', 'NAN', 'null', 'NA', '')
    values = list(table.iloc[:, col])
    pcodes = list(table.iloc[:, pcode_col])
    return [pcode for value, pcode in zip(values, pcodes) if value in missing_markers]
def data_load(path):
    '''
    Load the first sheet of a workbook into a DataFrame.

    The sheet is read column by column with xlrd. The first sheet row becomes
    the column names and is then removed, so the returned frame's row labels
    start at 1.

    NOTE(review): xlrd 2.x reads only the legacy .xls format; confirm the
    installed xlrd version can open the workbook being passed in.
    '''
    workbook = xlrd.open_workbook(path)
    sheet = workbook.sheets()[0]
    frame = pd.DataFrame({i: sheet.col_values(i) for i in range(sheet.ncols)})
    # Row 0 holds the header texts: use them as column names, then drop it.
    frame.rename(columns=frame.iloc[0, :], inplace=True)
    frame.drop([0], axis=0, inplace=True)
    return frame
def in_or_out(lyst, n):
    '''
    Return True when ``n`` equals any element of ``lyst``, otherwise False.

    Every element is examined, and an empty ``lyst`` yields False.
    '''
    return any(n == item for item in lyst)
def pre(matrix_text, addr):
    """Export a DataFrame as a comma-separated file.

    matrix_text: the DataFrame to export.
    addr: the destination path.

    The row index is not written and the file is encoded as UTF-8 with a
    byte-order mark ('utf_8_sig').
    """
    export_options = dict(
        sep=',',
        index=0,
        encoding='utf_8_sig',
        columns=matrix_text.columns,
    )
    matrix_text.to_csv(addr, **export_options)
def xlsx_to_csv_pd(csv_path, xlsx_path):
    '''
    Convert a workbook to a UTF-8 CSV file with pandas.

    Note the parameter names: ``csv_path`` is the file that is READ with
    ``pd.read_excel`` (its first column becomes the index) and ``xlsx_path``
    is the file that is WRITTEN as CSV.
    '''
    pd.read_excel(csv_path, index_col=0).to_csv(xlsx_path, encoding='utf-8')
def moudle12(old_table, pre_pcode, price_col, pcode_col):
    '''
    Find the product whose price is closest to the target product's price.

    Used when a whole brand lacks some parameter: the value of the product
    returned here is then copied into the empty cell.

    Args:
        old_table: The table to search.
        pre_pcode: Product code of the row being filled.
        price_col: Positional column of the price.
        pcode_col: Positional column of the product code.

    Returns:
        The product code of the OTHER row with the smallest absolute price
        difference (the earliest row on ties). If the table holds no other
        row, ``pre_pcode`` itself is returned.

    Raises:
        ValueError: If ``pre_pcode`` is not in the product-code column or a
            price cannot be converted to float.

    NOTE(review): the chosen product may itself lack the parameter being
    filled; this function is not told which column that is.
    '''
    prices = [float(value) for value in old_table.iloc[:, price_col]]
    pcodes = list(old_table.iloc[:, pcode_col])
    # Locate the target in the column named by pcode_col rather than a
    # hard-coded default column.
    target_row = pcodes.index(pre_pcode)
    target_price = prices[target_row]

    # The target row always has difference 0 to itself; leaving it in would
    # make the function return pre_pcode and turn the later copy into a no-op.
    candidates = [row for row in range(len(prices)) if row != target_row]
    if not candidates:
        return pre_pcode
    best_row = min(candidates, key=lambda row: abs(prices[row] - target_price))
    return pcodes[best_row]
def na(data_matrix, h):
    """Overwrite h randomly chosen cells of a copy of the table with 0.

    Row and column positions are drawn independently with
    ``np.random.randint``, so the same cell may be picked more than once.
    The input is left untouched.

    Returns the modified copy plus the arrays of chosen row and column
    positions.
    """
    import numpy as np
    rows = np.random.randint(data_matrix.shape[0], size=h)
    cols = np.random.randint(data_matrix.shape[1], size=h)
    data_nan = data_matrix.copy()
    for row, col in zip(rows, cols):
        data_nan.iloc[row, col] = 0
    return data_nan, rows, cols
def if_none(table,):
    '''
    Return True when ``table`` has no rows, otherwise False.

    The result is always a bool (previously a non-empty table gave an
    implicit None), and a frame without columns is handled as well.
    '''
    return len(table.index) == 0
def data_add(table, pcode, col, pcode_col, brand_col, price_col, type_col, ser_col):
    '''
    Fill the missing cell in column ``col`` of the product ``pcode``.

    Candidates are narrowed step by step, with the row labels renumbered from
    0 between steps:
      1. ``moudle2``: rows of the same brand.
      2. ``moudle1``: of those, rows that have a value in ``col``.
      3. ``moudle5``: of those, rows within 50% of the target price.
      4. ``moudle6``: the remaining row with the most similar model; its
         value is copied with ``moudle9``.

    If no same-brand row has a value, the series column is derived from the
    model with ``moudle10``; any other column is copied from the product
    chosen by ``moudle12``. If step 1 or step 3 leaves no rows, the table is
    returned unchanged.

    Returns:
        The table, which the fill helpers modify in place.
    '''
    same_brand = moudle2(table, table, pcode, brand_col)
    if if_none(same_brand):
        return table

    with_value = moudle1(same_brand.reset_index(drop=True), col)
    if if_none(with_value):
        # The whole brand lacks this parameter.
        if col == ser_col:
            return moudle10(table, pcode, ser_col, type_col)
        nearest_pcode = moudle12(table, pcode, price_col, pcode_col)
        return moudle9(table, nearest_pcode, pcode, col)

    price_range = 0.5
    near_price = moudle5(
        table, with_value.reset_index(drop=True), price_range, pcode, price_col
    )
    if if_none(near_price):
        return table

    best_pcode = moudle6(
        table, near_price.reset_index(drop=True), pcode, type_col, pcode_col
    )
    return moudle9(table, best_pcode, pcode, col)
\ No newline at end of file
from data_add_class import Data_add
def main():
    '''
    Run the fill pipeline with the built-in file names and column numbers.

    An earlier interactive version asked for every setting with ``input``;
    it is disabled and the values below are used instead.
    '''
    filler = Data_add(
        table_path="lib_nonstand-stand_price.xlsx",
        out_path="lib_nonstand-stand_new.csv",
        pcode_col=2,    # product code (or SKU) column
        brand_col=4,    # brand column
        price_col=47,   # price column
        type_col=38,    # model column
        ser_col=44,     # series column
        data_col=3,     # free-text column (e.g. a URL) that is not predicted
    )
    filler.data_add_main()


if __name__ == "__main__":
    main()
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment