Commit 4521e64e authored by Jialin

真·代码最终修改

parent cac7d04a
......@@ -6,13 +6,20 @@ import pandas as pd
import re
import xlsxwriter
import numpy as np
import pymssql
def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
# filepath:文件路径,thre为两个品牌下型号重合率阈值,inner_thre为两个品牌下某条型号内关键词重合率阈值,a为权重调整,sheet_name为表单名
df = pd.read_excel(filepath, sheet_name=sheet_name, converters = {'产品编码':str})
df_null = pd.read_excel(".\\异常数据表格.xlsx")
invalid_list = df_null['异常数据名称'].values
conn_zi_new = pymssql.connect(host='123.56.115.207', user='zgcprice3311', password='zgcprice20200628',
database='ZI_NEW', autocommit=True)
cursor_zi_new = conn_zi_new.cursor()
cursor_zi_new.execute(f"select col_value from data_washing_external where category_name='通用' and col_name='异常数据名称'")
invalid_list_fetch = cursor_zi_new.fetchall()
invalid_list = []
for invalid_tuple in invalid_list_fetch:
invalid_list.append(invalid_tuple[0])
# 处理缺失值
valid_index=[]
for i in df.index:
......
......@@ -13,23 +13,41 @@ import pymssql
import time
# category为产品类型
# a是同一品牌或父品牌下产品数量占产品总数量的百分比,作为阈值,a越大,有异常的产品越多;b和a一样,只是用于产品数据类型和参数
# c_list是产品参数中,数据类型较为统一的参数 在excel列名中的位置,从0开始,必须是一个list
# b是阈值,用于数据类型,数据格式;比如当某列数据类型占总数据量比例小于b时,该列拥有数据类型的数据索引将被作为异常数据返回
def class_washing(category, filepath, b=0.01):
df_null=pd.read_excel(".\\异常数据表格.xlsx")
invalid_list = df_null['异常数据名称'].values
conn_zi_new = pymssql.connect(host='123.56.115.207', user='zgcprice3311', password='zgcprice20200628',
database='ZI_NEW', autocommit=True)
cursor_zi_new = conn_zi_new.cursor()
# 一些数据写着‘暂无数据’,‘无参数,需补充’等替代值,这些替代值从外部表格被提取到invalid_list中
cursor_zi_new.execute(f"select col_value from data_washing_external where category_name='通用' and col_name='异常数据名称'")
invalid_list_fetch=cursor_zi_new.fetchall()
invalid_list=[]
for invalid_tuple in invalid_list_fetch:
invalid_list.append(invalid_tuple[0])
df = pd.read_excel(filepath,converters = {'产品编码':str})
df.drop(columns='Unnamed: 0', axis=1, inplace=True)
# 检测每列数据中 同一数据类型的产品数量少于产品总数量的b 的产品index.(可优化成字典形式存储读取)
# 检测每列数据中 同一数据类型的产品数量少于产品总数量的b 的产品index. 数据类型的例子:string, float, int
t1=time.time()
print('开始检测数据类型')
dtype_minor_dict = {}
for col in df_null['数据类型异常'][df_null['类别']==category][df_null['数据类型异常'].notnull()].values:
type_list = {}
valid_index = []
dtype_minor_dict = {} # 该字典键为异常数据索引,值为异常数据类型出现的列名,值是字符串而非列表,多个列名由空格隔开
# #从外部表提取用于判断‘数据类型异常’的列,放入dtype_col_list
cursor_zi_new.execute(f"select col_value from data_washing_external where category_name='{category}' and col_name='数据类型异常'")
dtype_col_fetch=cursor_zi_new.fetchall()
if not dtype_col_fetch:
print('该类别无‘数据类型异常’列名,或者类别名输入有误,请查证数据库')
return
dtype_col_list=[]
for dtype_tuple in dtype_col_fetch:
dtype_col_list.append(dtype_tuple[0])
# #按列找
for col in dtype_col_list:
type_list = {} # 键为该列出现的数据类型,值为该类型出现的次数
valid_index = [] # 值为‘暂无数据’等替代值的数据被忽略,值为实际值的数据索引提取到valid_index
for i in df.index:
if df.loc[i,col] in invalid_list:
continue
......@@ -41,6 +59,7 @@ def class_washing(category, filepath, b=0.01):
type_list[data_type] += 1
for data_type_i in type_list:
if type_list[data_type_i] < len(valid_index) * b:
# 下面这一段可以优化,可以在type_list字典中存储每个数据类型对应的数值索引,先记下,优化阶段再改
for i in valid_index:
if type(df.loc[i][col]) == data_type_i:
if i in dtype_minor_dict.keys():
......@@ -48,10 +67,11 @@ def class_washing(category, filepath, b=0.01):
elif i not in dtype_minor_dict.keys():
dtype_minor_dict[i] = col
# #将dtype_minor_dict字典的键,也就是异常数据索引,放入dtype_minor
dtype_minor=[]
dtype_minor.extend(dtype_minor_dict.keys())
# 在检测完产品数据类型后,将所有数据类型转换为string
# 在检测完产品数据类型后,将所有数据类型转换为string,因为后面的操作都是对于字符串的,如果有数据类型不是字符串就比较麻烦
for col in df.columns:
df[col] = df[col].astype(str)
......@@ -59,21 +79,20 @@ def class_washing(category, filepath, b=0.01):
t2=time.time()
print(t2-t1)
print('开始检测错误类别和错误名称')
wrong_class = []
not_in_name = []
wrong_class = [] # 类型错误数据索引
not_in_name = [] # 名称中不带类型的数据索引
for i in df.index:
if df.loc[i, '产品类别'] != category:
wrong_class.append(i)
if category not in df.loc[i, '产品名称']:
not_in_name.append(i)
# 检测品牌中是否有不在category下对应的brand_id的产品品牌
# 检测品牌中有哪些不在数据表中该类别下属品牌中
t3=time.time()
print(t3-t2)
print('开始检测错误品牌')
conn_zi_new = pymssql.connect(host='123.56.115.207', user='zgcprice3311', password='zgcprice20200628',
database='ZI_NEW', autocommit=True)
cursor_zi_new = conn_zi_new.cursor()
# #提取类别id
cursor_zi_new.execute(f"select id from p_category where name='{category}'")
category_id=cursor_zi_new.fetchone()
if not category_id:
......@@ -81,12 +100,14 @@ def class_washing(category, filepath, b=0.01):
return
category_id = category_id[0]
# #找到该类别下的品牌的brand_id
cursor_zi_new.execute(f"select brandid from p_spu where categoryid={category_id}")
brand_id_fetchall=cursor_zi_new.fetchall()
brand_id_list = []
for brand_tuple in brand_id_fetchall:
brand_id_list.append(brand_tuple[0])
# #从brand_id找到品牌名(为什么不直接从category_id找品牌名呢?问得好,没有这个表)
brand_name_list = []
for brand_id in brand_id_list:
cursor_zi_new.execute(f"select name from p_brand where id={brand_id}")
......@@ -94,84 +115,48 @@ def class_washing(category, filepath, b=0.01):
if brand_name_fetch:
brand_name_list.append(brand_name_fetch[0].strip("'"))
# #如果哪条数据品牌没在品牌列表中,作为异常值返回
wrong_brand=[]
for i in df.index:
if df.loc[i]['产品品牌'] not in brand_name_list:
wrong_brand.append(i)
# cursor_zi_new.execute(f"select id from p_brand where name='{df.loc[i]['产品品牌']}'")
# brand_id=cursor_zi_new.fetchone()
# if not brand_id:
# wrong_brand.append(i)
# continue
# brand_id=brand_id[0]
# if brand_id not in brand_id_list:
# wrong_brand.append(i)
# # 检测产品父品牌中品牌出现次数小于产品总数的a的产品
# father_brand_minor = []
# father_brand_list = []
# col='产品父品牌'
# valid_index=[]
# for i in df.index:
# if df.loc[i, col] in invalid_list:
# continue
# valid_index.append(i)
# valid_df=df.loc[valid_index]
# father_brand_num = valid_df.groupby(by='产品父品牌')['产品编码'].count() # 之所以用产品编码来计数是因为产品编码肯定不会有缺失值
# father_num_list = [x for x in father_brand_num.unique() if x < len(valid_df.index)*a]
# for i in father_brand_num.index: # i 就是产品父品牌
# if father_brand_num.loc[i] in father_num_list: # father_brand_num.loc[i] 就是该父品牌出现次数
# father_brand_list.append(i)
#
# for i in valid_df.index:
# if valid_df.loc[i, '产品父品牌'] in father_brand_list:
# father_brand_minor.append(i)
#
# # 检测产品品牌中品牌出现次数少的产品
# brand_minor = []
# brand_list = []
# col = '产品品牌'
# valid_index = []
# for i in df.index:
# if df.loc[i, col] in invalid_list:
# continue
# valid_index.append(i)
# valid_df=df.loc[valid_index]
# brand_num = valid_df.groupby(by='产品品牌')['产品编码'].count() # 同上
# num_list = [x for x in brand_num.unique() if x < len(valid_df.index)*a]
# for i in brand_num.index:
# if brand_num.loc[i] in num_list:
# brand_list.append(i)
#
# for i in valid_df.index:
# if valid_df.loc[i, '产品品牌'] in brand_list:
# brand_minor.append(i)
# 检测产品参数列中数据长度离该列平均数据长度2*std之外的产品,7是第一个产品参数列,-2是质保时间,-1是产品型号
# 检测产品参数列中数据长度离该列平均数据长度2*std之外的产品
t4=time.time()
print(t4-t3)
print('开始检测错误长度')
length_minor_dict = {}
for col in df_null['数据长度异常'][df_null['类别']==category][df_null['数据长度异常'].notnull()].values:
col_length = []
valid_index=[]
# #从外部表找到用于数据长度判断的列名
cursor_zi_new.execute(f"select col_value from data_washing_external where category_name='{category}' and col_name='数据长度异常'")
length_col_fetch = cursor_zi_new.fetchall()
if not length_col_fetch:
print('该类别无‘数据长度异常’列名,或者类别名输入有误,请查证数据库')
return
length_col_list = []
for length_tuple in length_col_fetch:
length_col_list.append(length_tuple[0])
# #按列找
for col in length_col_list:
col_length = [] # 包含该列每条数据的长度
valid_index=[] # 同上
for i in df.index:
if df.loc[i, col] in invalid_list:
continue
valid_index.append(i)
col_length.append(len(df.loc[i, col]))
if col_length:
if col_length: # 有时候一列数据全都是无效数据所以加个条件
std = np.array(col_length).std()
mean = np.array(col_length).mean()
for counter, length in enumerate(col_length):
if length < mean - 2 * std or length > mean + 2 * std:
# length_minor_dict[valid_index[counter]]=col
index=valid_index[counter]
index=valid_index[counter] # valid_index和col_length是一一对应的
if index in length_minor_dict.keys():
length_minor_dict[index] += ' ' + col
elif index not in length_minor_dict.keys():
length_minor_dict[index] = col
length_minor=[]
length_minor.extend(length_minor_dict.keys())
......@@ -179,8 +164,21 @@ def class_washing(category, filepath, b=0.01):
t5=time.time()
print(t5-t4)
print('开始检测错误数据格式')
# #从外部表中提取用于数据格式判断的列名
format_minor_dict = {}
for col in df_null['数据格式异常'][df_null['类别']==category][df_null['数据格式异常'].notnull()].values:
cursor_zi_new.execute(
f"select col_value from data_washing_external where category_name='{category}' and col_name='数据格式异常'")
format_col_fetch = cursor_zi_new.fetchall()
if not format_col_fetch:
print('该类别无‘数据格式异常’列名,或者类别名输入有误,请查证数据库')
return
format_col_list = []
for format_tuple in format_col_fetch:
format_col_list.append(format_tuple[0])
# #按列找
for col in format_col_list:
counter_dict = {}
valid_index = []
for i in df.index:
......@@ -218,19 +216,17 @@ def class_washing(category, filepath, b=0.01):
format_minor=[]
format_minor.extend(format_minor_dict.keys())
# length_record = []
# for keys in counter_dict:
# if not length_record:
# length_record.append([len(counter_dict[keys]), counter_dict[keys]])
# elif len(counter_dict[keys]) < length_record[0][0]:
# length_record[0] = [len(counter_dict[keys]), counter_dict[keys]]
#
# format_minor += length_record[0][1]
# 对于产品名称中没有“扫描仪”的,如果没有“高拍仪”就挑出来
# 对于产品名称的第二次筛选,比如在扫描仪中,对于产品名称中没有“扫描仪”的,如果没有“高拍仪”就挑出来
cursor_zi_new.execute(
f"select col_value from data_washing_external where category_name='{category}' and col_name='产品名称异常'")
name_col_fetch = cursor_zi_new.fetchall()
name_col_list = []
for name_tuple in name_col_fetch:
name_col_list.append(name_tuple[0])
not_in_name2 = []
for i in not_in_name:
for special_name in df_null['产品名称异常'][df_null['类别']==category][df_null['产品名称异常'].notnull()].values:
for special_name in name_col_list:
if special_name in df.loc[i,'产品名称']:
break
not_in_name2.append(i)
......@@ -239,8 +235,18 @@ def class_washing(category, filepath, b=0.01):
t6=time.time()
print(t6-t5)
print('开始检测标准参数')
cursor_zi_new.execute(
f"select col_value from data_washing_external where category_name='{category}' and col_name='标准参数异常'")
stdparam_col_fetch = cursor_zi_new.fetchall()
if not stdparam_col_fetch:
print('该类别无‘标准参数异常’列名,或者类别名输入有误,请查证数据库。代码继续运行')
stdparam_col_list = []
for stdparam_tuple in stdparam_col_fetch:
stdparam_col_list.append(stdparam_tuple[0])
character_minor_dict = {}
for col_i in df_null['标准参数异常'][df_null['类别']==category][df_null['标准参数异常'].notnull()].values:
for col_i in stdparam_col_list:
temp_list = []
cursor_zi_new.execute(f"select stdvalue from ShuJuZiDian_Cfg where categoryname='{category}' and subtitle='{col_i.strip('*')}'")
standard_value_fetchall=cursor_zi_new.fetchall()
......@@ -265,24 +271,6 @@ def class_washing(category, filepath, b=0.01):
for keys_i in character_minor_dict:
character_minor.extend(character_minor_dict[keys_i])
# cha_num = valid_df.groupby(by=col_i)['产品编码'].count()
# num_list = [x for x in cha_num.unique() if x < len(valid_df.index)*b]
# for i in cha_num.index:
# if cha_num.loc[i] in num_list:
# tempo_list.append(i)
# for i in valid_df.index:
# if valid_df.loc[i, col_i] in tempo_list:
# tempo_list2.append(i)
# character_minor_dict[col_i] = tempo_list2
#
# character_minor = []
# for keys_i in character_minor_dict:
# character_minor.extend(character_minor_dict[keys_i])
t7=time.time()
print(t7-t6)
print('开始整合数据')
......@@ -292,7 +280,6 @@ def class_washing(category, filepath, b=0.01):
index_minor.extend(format_minor)
index_minor.extend(length_minor)
index_minor.extend(wrong_brand)
# index_minor.extend(father_brand_minor)
index_minor.extend(not_in_name2)
index_minor.extend(dtype_minor)
index_minor.extend(character_minor)
......@@ -301,13 +288,25 @@ def class_washing(category, filepath, b=0.01):
final_df = pd.DataFrame(np.zeros((len(index_minor), 7)), index=list(index_minor),
columns=['计数', '产品类别异常', '产品名称异常', '品牌异常', '数据类型异常', '数据格式异常', '数据长度异常'])
w_class = df_null['产品类别异常权重'][df_null['类别']==category][df_null['产品类别异常权重'].notnull()].values
w_format = df_null['数据格式异常权重'][df_null['类别']==category][df_null['数据格式异常权重'].notnull()].values
w_length = df_null['数据长度异常权重'][df_null['类别']==category][df_null['数据长度异常权重'].notnull()].values
w_brand = df_null['品牌异常权重'][df_null['类别']==category][df_null['品牌异常权重'].notnull()].values
w_name = df_null['产品名称异常权重'][df_null['类别']==category][df_null['产品名称异常权重'].notnull()].values
w_dtype = df_null['数据类型异常权重'][df_null['类别']==category][df_null['数据类型异常权重'].notnull()].values
w_stdparam = df_null['标准参数异常权重'][df_null['类别']==category][df_null['标准参数异常权重'].notnull()].values
weight_list=[]
weight_colname_list=['产品类别异常权重','数据格式异常权重','数据长度异常权重','品牌异常权重','产品名称异常权重','数据类型异常权重','标准参数异常权重']
for col_name in weight_colname_list:
cursor_zi_new.execute(
f"select col_value from data_washing_external where category_name='{category}' and col_name='{col_name}'")
weight_fetch=cursor_zi_new.fetchone()
if weight_fetch:
weight_list.append(weight_fetch[0])
else:
weight_list.append(1.0)
w_class = float(weight_list[0])
w_format = float(weight_list[1])
w_length = float(weight_list[2])
w_brand = float(weight_list[3])
w_name = float(weight_list[4])
w_dtype = float(weight_list[5])
w_stdparam = float(weight_list[6])
for i in index_minor:
count = 0
if i in wrong_class:
......@@ -322,9 +321,6 @@ def class_washing(category, filepath, b=0.01):
if i in wrong_brand:
count += w_brand
final_df.loc[i, '品牌异常'] = 1
# if i in father_brand_minor:
# count += 1
# final_df.loc[i, '父品牌异常'] = 1
if i in not_in_name2:
count += w_name
final_df.loc[i, '产品名称异常'] = 1
......@@ -347,8 +343,4 @@ def class_washing(category, filepath, b=0.01):
if __name__ == '__main__':
category='扫描仪'
filepath="E:\\ZDZC\\扫描仪参数确认.xlsx"
#c_list=[6,7,-4,-3]
# category = '扫描仪'
# filepath="E:\\ZDZC\\扫描仪参数确认.xlsx"
# c_list=[7,8,9]
class_washing(category, filepath)
......@@ -10,16 +10,29 @@ import pandas as pd
import re
import numpy as np
import xlsxwriter
def product_washing(filepath, category,thre=1, a=0):
df_null = pd.read_excel(".\\异常数据表格.xlsx")
invalid_list = df_null['异常数据名称'].values
import pymssql
def product_washing(filepath, category, thre=1, a=0):
conn_zi_new = pymssql.connect(host='123.56.115.207', user='zgcprice3311', password='zgcprice20200628',
database='ZI_NEW', autocommit=True)
cursor_zi_new = conn_zi_new.cursor()
cursor_zi_new.execute(f"select col_value from data_washing_external where category_name='通用' and col_name='异常数据名称'")
invalid_list_fetch = cursor_zi_new.fetchall()
invalid_list = []
for invalid_tuple in invalid_list_fetch:
invalid_list.append(invalid_tuple[0])
df=pd.read_excel(filepath, converters={'产品编码':str})
df.drop(columns='Unnamed: 0', axis=1, inplace=True)
for col in df.columns:
df[col]=df[col].astype(str)
cursor_zi_new.execute(f"select col_value from data_washing_external where category_name='{category}' and col_name='重复参数项'")
other_parameters_fetch=cursor_zi_new.fetchall()
other_parameters=[]
for param in other_parameters_fetch:
other_parameters.append(param[0])
related_product = []
brand_grouped = df.groupby(by='产品品牌')
for brand in brand_grouped:
......@@ -86,7 +99,7 @@ def product_washing(filepath, category,thre=1, a=0):
# brand_combined = temp_list1+temp_list2
tempo_dict[i] = [set(combined)]
other_parameters=df_null['重复参数项'][df_null['类别']==category][df_null['重复参数项'].notnull()].values
other_parameters_values=[]
for parameter in other_parameters:
other_parameters_values.append(brand[1].loc[i,parameter])
......
......@@ -162,5 +162,5 @@ def get_point_category_params_data(category):
conn_zi_new.close()
category = '扫描仪'
category = '激光打印机'
get_point_category_params_data(category)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment