Commit 4521e64e authored by Jialin

The truly final code change

parent cac7d04a
@@ -6,13 +6,20 @@ import pandas as pd
import re
import xlsxwriter
import numpy as np
import pymssql
def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
# filepath: path to the Excel file; thre: threshold on the model-overlap rate between two brands; inner_thre: threshold on the keyword-overlap rate within a single model entry across two brands; a: weight adjustment; sheet_name: sheet to read
df = pd.read_excel(filepath, sheet_name=sheet_name, converters = {'产品编码':str})
conn_zi_new = pymssql.connect(host='123.56.115.207', user='zgcprice3311', password='zgcprice20200628',
database='ZI_NEW', autocommit=True)
cursor_zi_new = conn_zi_new.cursor()
cursor_zi_new.execute(f"select col_value from data_washing_external where category_name='通用' and col_name='异常数据名称'")
invalid_list_fetch = cursor_zi_new.fetchall()
invalid_list = []
for invalid_tuple in invalid_list_fetch:
invalid_list.append(invalid_tuple[0])
# Handle missing values
valid_index=[]
for i in df.index:
...
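The fetch-and-flatten pattern against data_washing_external recurs in every function in this commit; a small helper could factor it out. This is a sketch only, assuming the query always returns single-column rows; the helper name is made up:

def fetch_config_values(cursor, category_name, col_name):
    # Return the col_value entries for one (category_name, col_name) pair as a flat list.
    cursor.execute(
        f"select col_value from data_washing_external "
        f"where category_name='{category_name}' and col_name='{col_name}'")
    return [row[0] for row in cursor.fetchall()]

# e.g. invalid_list = fetch_config_values(cursor_zi_new, '通用', '异常数据名称')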
@@ -13,23 +13,41 @@ import pymssql
import time
# category: the product category
# b is a threshold used for the data-type and data-format checks; for example, when the share of one data type in a column falls below b, the indices of the rows in that column holding that data type are returned as anomalies
def class_washing(category, filepath, b=0.01):
conn_zi_new = pymssql.connect(host='123.56.115.207', user='zgcprice3311', password='zgcprice20200628',
database='ZI_NEW', autocommit=True)
cursor_zi_new = conn_zi_new.cursor()
# Some records hold placeholder values such as ‘暂无数据’ or ‘无参数,需补充’; these placeholders are read from the external table into invalid_list
cursor_zi_new.execute(f"select col_value from data_washing_external where category_name='通用' and col_name='异常数据名称'")
invalid_list_fetch=cursor_zi_new.fetchall()
invalid_list=[]
for invalid_tuple in invalid_list_fetch:
invalid_list.append(invalid_tuple[0])
df = pd.read_excel(filepath,converters = {'产品编码':str})
df.drop(columns='Unnamed: 0', axis=1, inplace=True)
# For each checked column, collect the indices of products whose data type covers fewer than b of all products in that column. Example data types: string, float, int
t1=time.time()
print('开始检测数据类型')
dtype_minor_dict = {} # keys are the indices of anomalous rows; values are the column names in which the anomalous data type appears, stored as a single space-separated string rather than a list
# #Pull the columns used for the ‘数据类型异常’ (data-type anomaly) check from the external table into dtype_col_list
cursor_zi_new.execute(f"select col_value from data_washing_external where category_name='{category}' and col_name='数据类型异常'")
dtype_col_fetch=cursor_zi_new.fetchall()
if not dtype_col_fetch:
print('该类别无‘数据类型异常’列名,或者类别名输入有误,请查证数据库')
return
dtype_col_list=[]
for dtype_tuple in dtype_col_fetch:
dtype_col_list.append(dtype_tuple[0])
# #Check column by column
for col in dtype_col_list:
type_list = {} # keys: data types seen in this column; values: number of occurrences of each type
valid_index = [] # rows holding placeholder values such as ‘暂无数据’ are ignored; indices of rows with real values go into valid_index
for i in df.index:
if df.loc[i,col] in invalid_list:
continue
@@ -41,6 +59,7 @@ def class_washing(category, filepath, b=0.01):
type_list[data_type] += 1
for data_type_i in type_list:
if type_list[data_type_i] < len(valid_index) * b:
# The block below can be optimized: type_list could also store the row indices belonging to each data type; noted for the optimization pass (see the sketch after this hunk)
for i in valid_index:
if type(df.loc[i][col]) == data_type_i:
if i in dtype_minor_dict.keys():
@@ -48,10 +67,11 @@ def class_washing(category, filepath, b=0.01):
elif i not in dtype_minor_dict.keys():
dtype_minor_dict[i] = col
# #Put the keys of dtype_minor_dict, i.e. the anomalous row indices, into dtype_minor
dtype_minor=[]
dtype_minor.extend(dtype_minor_dict.keys())
# After the data-type check, cast every column to string: all later steps work on strings, and non-string values would get in the way
for col in df.columns:
df[col] = df[col].astype(str)
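A minimal sketch of the optimization noted in the comment above: record the row indices per data type while counting, so the second scan over valid_index is unnecessary. It assumes the surrounding names (df, col, valid_index, b, dtype_minor_dict) and keeps the same flagging behaviour:

type_rows = {}  # data type -> indices of the rows in this column holding that type
for i in valid_index:
    type_rows.setdefault(type(df.loc[i, col]), []).append(i)
for data_type_i, rows in type_rows.items():
    if len(rows) < len(valid_index) * b:  # minority type: flag all of its rows
        for i in rows:
            if i in dtype_minor_dict:
                dtype_minor_dict[i] += ' ' + col
            else:
                dtype_minor_dict[i] = col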
@@ -59,21 +79,20 @@ def class_washing(category, filepath, b=0.01):
t2=time.time()
print(t2-t1)
print('开始检测错误类别和错误名称')
wrong_class = [] # indices of rows whose product category is wrong
not_in_name = [] # indices of rows whose product name does not contain the category
for i in df.index:
if df.loc[i, '产品类别'] != category:
wrong_class.append(i)
if category not in df.loc[i, '产品名称']:
not_in_name.append(i)
# Check which product brands do not appear among the brands listed under this category in the database
t3=time.time()
print(t3-t2)
print('开始检测错误品牌')
# #Look up the category id
cursor_zi_new.execute(f"select id from p_category where name='{category}'")
category_id=cursor_zi_new.fetchone()
if not category_id:
@@ -81,12 +100,14 @@ def class_washing(category, filepath, b=0.01):
return
category_id = category_id[0]
# #Find the brand_ids of the brands under this category
cursor_zi_new.execute(f"select brandid from p_spu where categoryid={category_id}")
brand_id_fetchall=cursor_zi_new.fetchall()
brand_id_list = []
for brand_tuple in brand_id_fetchall:
brand_id_list.append(brand_tuple[0])
# #Map each brand_id to its brand name (why not look up brand names from category_id directly? Good question: there is no such table)
brand_name_list = []
for brand_id in brand_id_list:
cursor_zi_new.execute(f"select name from p_brand where id={brand_id}")
@@ -94,84 +115,48 @@ def class_washing(category, filepath, b=0.01):
if brand_name_fetch:
brand_name_list.append(brand_name_fetch[0].strip("'"))
# #Rows whose brand is not in the brand list are returned as anomalies
wrong_brand=[]
for i in df.index:
if df.loc[i]['产品品牌'] not in brand_name_list:
wrong_brand.append(i)
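The loop above runs one query per brand_id; the same brand_name_list could likely be built with a single join, assuming p_spu.brandid references p_brand.id as the code implies. A sketch:

cursor_zi_new.execute(
    f"select distinct b.name from p_brand b "
    f"join p_spu s on s.brandid = b.id where s.categoryid={category_id}")
brand_name_list = [row[0].strip("'") for row in cursor_zi_new.fetchall()]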
# Detect, in each checked product-parameter column, the products whose value length lies more than 2*std away from that column's mean value length
t4=time.time()
print(t4-t3)
print('开始检测错误长度')
length_minor_dict = {}
# #Fetch the column names used for the data-length check from the external table
cursor_zi_new.execute(f"select col_value from data_washing_external where category_name='{category}' and col_name='数据长度异常'")
length_col_fetch = cursor_zi_new.fetchall()
if not length_col_fetch:
print('该类别无‘数据长度异常’列名,或者类别名输入有误,请查证数据库')
return
length_col_list = []
for length_tuple in length_col_fetch:
length_col_list.append(length_tuple[0])
# #Check column by column
for col in length_col_list:
col_length = [] # holds the length of every value in this column
valid_index=[] # same as above: placeholder rows are skipped
for i in df.index:
if df.loc[i, col] in invalid_list:
continue
valid_index.append(i)
col_length.append(len(df.loc[i, col]))
if col_length: # a column may consist entirely of invalid values, hence the guard
std = np.array(col_length).std()
mean = np.array(col_length).mean()
for counter, length in enumerate(col_length):
if length < mean - 2 * std or length > mean + 2 * std:
index=valid_index[counter] # valid_index and col_length are aligned one-to-one
if index in length_minor_dict.keys():
length_minor_dict[index] += ' ' + col
elif index not in length_minor_dict.keys():
length_minor_dict[index] = col
length_minor=[]
length_minor.extend(length_minor_dict.keys())
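For reference, the mean ± 2*std rule above can also be applied to a whole column at once with numpy; a sketch reusing col_length, valid_index and length_minor_dict from the loop above:

lengths = np.array(col_length)
outliers = np.abs(lengths - lengths.mean()) > 2 * lengths.std()
for index in np.array(valid_index)[outliers]:
    if index in length_minor_dict:
        length_minor_dict[index] += ' ' + col
    else:
        length_minor_dict[index] = col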
@@ -179,8 +164,21 @@ def class_washing(category, filepath, b=0.01):
t5=time.time()
print(t5-t4)
print('开始检测错误数据格式')
# #Fetch the column names used for the data-format check from the external table
format_minor_dict = {}
cursor_zi_new.execute(
f"select col_value from data_washing_external where category_name='{category}' and col_name='数据格式异常'")
format_col_fetch = cursor_zi_new.fetchall()
if not format_col_fetch:
print('该类别无‘数据格式异常’列名,或者类别名输入有误,请查证数据库')
return
format_col_list = []
for format_tuple in format_col_fetch:
format_col_list.append(format_tuple[0])
# #Check column by column
for col in format_col_list:
counter_dict = {}
valid_index = []
for i in df.index:
@@ -218,19 +216,17 @@ def class_washing(category, filepath, b=0.01):
format_minor=[]
format_minor.extend(format_minor_dict.keys())
# Second pass over product names: e.g. for 扫描仪 (scanners), a product whose name lacks ‘扫描仪’ is flagged unless it contains ‘高拍仪’ (see the for-else sketch below)
cursor_zi_new.execute(
f"select col_value from data_washing_external where category_name='{category}' and col_name='产品名称异常'")
name_col_fetch = cursor_zi_new.fetchall()
name_col_list = []
for name_tuple in name_col_fetch:
name_col_list.append(name_tuple[0])
not_in_name2 = []
for i in not_in_name:
for special_name in name_col_list:
if special_name in df.loc[i,'产品名称']:
break
not_in_name2.append(i)
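If the append above is intended to run only when none of the special names matched, a for-else expresses that directly; a sketch of that reading:

not_in_name2 = []
for i in not_in_name:
    for special_name in name_col_list:
        if special_name in df.loc[i, '产品名称']:
            break
    else:
        # no special name matched, so the row stays flagged
        not_in_name2.append(i)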
@@ -239,8 +235,18 @@ def class_washing(category, filepath, b=0.01):
t6=time.time()
print(t6-t5)
print('开始检测标准参数')
cursor_zi_new.execute(
f"select col_value from data_washing_external where category_name='{category}' and col_name='标准参数异常'")
stdparam_col_fetch = cursor_zi_new.fetchall()
if not stdparam_col_fetch:
print('该类别无‘标准参数异常’列名,或者类别名输入有误,请查证数据库。代码继续运行')
stdparam_col_list = []
for stdparam_tuple in stdparam_col_fetch:
stdparam_col_list.append(stdparam_tuple[0])
character_minor_dict = {}
for col_i in stdparam_col_list:
temp_list = []
cursor_zi_new.execute(f"select stdvalue from ShuJuZiDian_Cfg where categoryname='{category}' and subtitle='{col_i.strip('*')}'")
standard_value_fetchall=cursor_zi_new.fetchall()
@@ -265,24 +271,6 @@ def class_washing(category, filepath, b=0.01):
for keys_i in character_minor_dict:
character_minor.extend(character_minor_dict[keys_i])
t7=time.time()
print(t7-t6)
print('开始整合数据')
@@ -292,7 +280,6 @@ def class_washing(category, filepath, b=0.01):
index_minor.extend(format_minor)
index_minor.extend(length_minor)
index_minor.extend(wrong_brand)
index_minor.extend(not_in_name2)
index_minor.extend(dtype_minor)
index_minor.extend(character_minor)
@@ -301,13 +288,25 @@ def class_washing(category, filepath, b=0.01):
final_df = pd.DataFrame(np.zeros((len(index_minor), 7)), index=list(index_minor),
columns=['计数', '产品类别异常', '产品名称异常', '品牌异常', '数据类型异常', '数据格式异常', '数据长度异常'])
weight_list=[]
weight_colname_list=['产品类别异常权重','数据格式异常权重','数据长度异常权重','品牌异常权重','产品名称异常权重','数据类型异常权重','标准参数异常权重']
for col_name in weight_colname_list:
cursor_zi_new.execute(
f"select col_value from data_washing_external where category_name='{category}' and col_name='{col_name}'")
weight_fetch=cursor_zi_new.fetchone()
if weight_fetch:
weight_list.append(weight_fetch[0])
else:
weight_list.append(1.0)
w_class = float(weight_list[0])
w_format = float(weight_list[1])
w_length = float(weight_list[2])
w_brand = float(weight_list[3])
w_name = float(weight_list[4])
w_dtype = float(weight_list[5])
w_stdparam = float(weight_list[6])
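The positional coupling between weight_list and weight_colname_list is easy to break when a new weight column is added; a dict keyed by the weight column name keeps each pair together. A sketch using the same query as above:

weights = {}
for col_name in weight_colname_list:
    cursor_zi_new.execute(
        f"select col_value from data_washing_external "
        f"where category_name='{category}' and col_name='{col_name}'")
    row = cursor_zi_new.fetchone()
    weights[col_name] = float(row[0]) if row else 1.0
w_class = weights['产品类别异常权重']  # and so on for the other weights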
for i in index_minor:
count = 0
if i in wrong_class:
@@ -322,9 +321,6 @@ def class_washing(category, filepath, b=0.01):
if i in wrong_brand:
count += w_brand
final_df.loc[i, '品牌异常'] = 1
if i in not_in_name2:
count += w_name
final_df.loc[i, '产品名称异常'] = 1
@@ -347,8 +343,4 @@ def class_washing(category, filepath, b=0.01):
if __name__ == '__main__':
category='扫描仪'
filepath="E:\\ZDZC\\扫描仪参数确认.xlsx"
class_washing(category, filepath)
@@ -10,16 +10,29 @@ import pandas as pd
import re
import numpy as np
import xlsxwriter
import pymssql
def product_washing(filepath, category, thre=1, a=0):
conn_zi_new = pymssql.connect(host='123.56.115.207', user='zgcprice3311', password='zgcprice20200628',
database='ZI_NEW', autocommit=True)
cursor_zi_new = conn_zi_new.cursor()
cursor_zi_new.execute(f"select col_value from data_washing_external where category_name='通用' and col_name='异常数据名称'")
invalid_list_fetch = cursor_zi_new.fetchall()
invalid_list = []
for invalid_tuple in invalid_list_fetch:
invalid_list.append(invalid_tuple[0])
df=pd.read_excel(filepath, converters={'产品编码':str})
df.drop(columns='Unnamed: 0', axis=1, inplace=True)
for col in df.columns:
df[col]=df[col].astype(str)
cursor_zi_new.execute(f"select col_value from data_washing_external where category_name='{category}' and col_name='重复参数项'")
other_parameters_fetch=cursor_zi_new.fetchall()
other_parameters=[]
for param in other_parameters_fetch:
other_parameters.append(param[0])
related_product = []
brand_grouped = df.groupby(by='产品品牌')
for brand in brand_grouped:
@@ -86,7 +99,7 @@ def product_washing(filepath, category,thre=1, a=0):
# brand_combined = temp_list1+temp_list2
tempo_dict[i] = [set(combined)]
other_parameters_values=[]
for parameter in other_parameters:
other_parameters_values.append(brand[1].loc[i,parameter])
...
@@ -162,5 +162,5 @@ def get_point_category_params_data(category):
conn_zi_new.close()
category = '激光打印机'
get_point_category_params_data(category)
\ No newline at end of file