Commit fd535804 authored by Zhouxingyu

代码上传

parents
Pipeline #69 failed with stages
SKU,product_type,product_band,NAME
8146008,18K,梦金园,梦金园玫瑰18K金五角星芒吊坠不含链 定价
7887835,0.8,六福珠宝(LUKFOOK JEWELLERY),六福珠宝 18K金玫瑰色心形吊坠女款链坠不含项链 定价 L18TBKP0051R 总重约0.80克
4146484,0.60-0.79,周六福(ZLF),周六福珠宝 18K金女款玫瑰金吊坠心形蝴蝶 不含链KI040668 约0.60-0.79g
8999787,40cm,潮宏基(CHJ JEWELLERY),潮宏基 CHJ JEWELLERY 悦己-宝心 玛瑙贝壳18K金彩金项链 XQK33900010 约40cm加尾链
8788926,40cm,潮宏基(CHJ JEWELLERY),潮宏基 CHJ JEWELLERY 悦己-缘心 18K金彩金项链 XQK30000303 约40cm加尾链
8788920,42cm,潮宏基(CHJ JEWELLERY),潮宏基 CHJ JEWELLERY 哆啦A梦-站立 贝壳18K金彩金项链 XQK34200012 约42cm加尾链
8429247,90373N,周生生(CHOW SANG SANG),周生生CHOW SANG SANG 18K红色黄金Love Decode爱情密语星星项链 女款 90373N 47厘米
8352133,40cm,潮宏基(CHJ JEWELLERY),潮宏基 CHJ JEWELLERY 悦己-小幸运love 18K金彩金项链 XQK30000299 约40cm加尾链
8217452,89865N,周生生(CHOW SANG SANG),周生生CHOW SANG SANG 18K红色黄金Choker心形钻石项链89865N 43厘米
5605336,40-42cm,赛菲尔(Sunfeel),赛菲尔 18k金项链女款 K黄水波纹锁骨链 金链子细 彩金 约40-42cm
5483710,04800N18KY,周生生(CHOW SANG SANG),周生生CHOW SANG SANG 18K金黄色黄金项链百搭素链女款 04800N18KY 45厘米
5264080,E121035,周大福(CHOW TAI FOOK),周大福(CHOW TAI FOOK)七夕情人节礼物 时尚水波链 18K金项链 E121035 980 45cm
5113397,04800N18KY,周生生(CHOW SANG SANG),周生生CHOW SANG SANG 18K金黄色黄金项链百搭素链女款 04800N18KY 40厘米
4030285,18k,赛菲尔(Sunfeel),赛菲尔 18k金项链 女款O字十字K金项链 锁骨链女 金链子
\ No newline at end of file
sku,productcode
8146008,na
7887835,匹配5项以上。
4146484,na
8999787,匹配5项以上。
8788926,匹配5项以上。
8788920,匹配5项以上。
8429247,na
8352133,匹配5项以上。
8217452,na
5605336,na
5483710,na
5264080,na
5113397,na
4030285,na
This diff is collapsed.
import jieba
import re
import xlrd
import pandas as pd
import numpy as np
class brandd:
    """Extract candidate brand-name tokens from an Excel workbook.

    The first worksheet of the workbook at ``addr`` must have a header row
    containing a ``BAND_NAME`` column; every value in that column is
    segmented with jieba and the tokens that contain a letter or a Chinese
    character are collected.
    """

    def __init__(self, addr):
        # Path of the workbook read by runing().
        self.addr = addr

    def contain_let(self, string):
        """Return True if ``string`` contains at least one ASCII letter."""
        # The range 'A-z' would also match '[', '\\', ']', '^', '_' and '`',
        # which sit between 'Z' and 'a' in ASCII, so spell out both ranges.
        return bool(re.search('[A-Za-z]', string))

    def contain_chin(self, string):
        """Return True if ``string`` contains at least one Chinese character."""
        return re.search(u'[\u4e00-\u9fa5]', string) is not None

    def read_in_1(self, addr_in):
        """Read the ``BAND_NAME`` column and return its unique tokens.

        :param addr_in: path of the Excel workbook to open with xlrd.
        :return: list of distinct tokens (order not defined) that contain a
            letter or a Chinese character.
        """
        datafile = xlrd.open_workbook(addr_in)
        table = datafile.sheets()[0]
        matrix_text = pd.DataFrame([])
        for i in range(table.ncols):
            matrix_text[i] = table.col_values(i)
        # The first sheet row holds the column titles: promote it to the
        # header and drop it from the data.
        matrix_text.rename(columns=matrix_text.iloc[0, :], inplace=True)
        matrix_text.drop([0], axis=0, inplace=True)
        type_nonstand = matrix_text[['BAND_NAME']]
        brand_tokens = set()
        for i in range(len(type_nonstand)):
            for token in jieba.cut(type_nonstand.iloc[i, 0]):
                if self.contain_let(token) or self.contain_chin(token):
                    brand_tokens.add(token)
        return list(brand_tokens)

    def runing(self):
        """Return the brand tokens extracted from the workbook at ``self.addr``."""
        return self.read_in_1(self.addr)
from moudules import *
import pandas as pd
# Module-level database helper used by left(); `sql_find` is presumably
# provided by the star import of `moudules` — verify against that module.
sqlserver = sql_find()
# NOTE: this rebinds the name `similar` from the imported class to an
# instance, so the class itself is no longer reachable under that name here.
similar = similar()
def logic_function(table, price_name='jd_price'):
    """Process every row of ``table`` and classify its SKU.

    Rows whose '库内已有产品' value is not 'NULL' are handled by ``left``;
    the others go through ``right``.  Both return ``(label, symbol)`` where
    symbol 0 means unchanged, 1 means modified and 6 means moved to the
    new-product library.

    After every successful row the checkpoint file '断点记录.txt' is
    rewritten: its first line is the number of rows visited so far and its
    second line lists the row numbers that must be entered into the price
    system.  If the run is interrupted, feed those rows into the price
    system and delete the already-processed rows from the table.

    :param table: mapping/DataFrame with the columns '库内已有产品', 'SKU',
        'NAME' and ``price_name``.
    :param price_name: name of the column holding the scraped price.
    :return: ``(sku_lyst, price_lyst)`` - the labels of the rows that were
        processed successfully and the row numbers whose state was modified.
    """
    sku_lyst = []
    price_lyst = []
    rows = zip(table['库内已有产品'], table['SKU'], table['NAME'], table[price_name])
    # enumerate() keeps the row number aligned with the table even when a row
    # fails; a counter bumped only on success drifts after the first error
    # and makes price_lyst point at the wrong rows.
    for row, (productcode, sku, name, product_price) in enumerate(rows):
        try:
            if productcode != 'NULL':
                sku_label, symbol = left(productcode, sku, product_price)
            else:
                sku_label, symbol = right(sku, str(name), product_price)
            sku_lyst.append(sku_label)
            if symbol == 1:
                price_lyst.append(row)
            with open('断点记录.txt', 'w', encoding="utf-8", errors='ignore') as f:
                f.write(f'{row + 1}\n')
                f.write(''.join(f'{price}, ' for price in price_lyst))
        except Exception as exc:
            # Best effort: report the failing row and carry on with the next
            # one.  KeyboardInterrupt is deliberately no longer swallowed.
            print(f'在第{row}条出错!')
            print(repr(exc))
    print('操作完成!')
    return sku_lyst, price_lyst
def left(productcode, sku, product_price):
    """Handle a SKU that already maps to a product code in the database.

    For products in state '1', '2' or '4' the state is set to '1' and, when
    the stored price is empty, the price is filled from ``product_price``.
    Products already in state '6' are only reported.

    :param productcode: product code looked up through the module-level
        ``sqlserver`` helper.
    :param sku: SKU used to build the returned label.
    :param product_price: price taken from the input table.
    :return: ``(label, symbol)``; ``symbol`` is the value returned by
        ``state_update`` or 6 for products already in state '6'.
    :raises ValueError: if the product is in a state this function does not
        handle (previously this fell off the end and returned ``None``, which
        surfaced as an opaque unpacking error in the caller).
    """
    empty_prices = ('0', 'NULL', '', 'None')
    state = sqlserver.state_extract(productcode)
    if state in ('1', '2', '4'):
        sub_class = True  # placeholder: plug the sub-category check in here
        if sub_class:
            # Set the state to '1' in the database.
            symbol = sqlserver.state_update(productcode, state_num='1')
            price = sqlserver.price_get(productcode)
            if price in empty_prices:
                if product_price in empty_prices:
                    # NOTE(review): no price is available from either source;
                    # the crawler call that was meant to fetch one by SKU is
                    # not wired in, so the stored value is written back as is.
                    sqlserver.price_update(productcode, price)
                else:
                    sqlserver.price_update(productcode, product_price)
            return f'{sku}_补齐重要参数', symbol
        symbol = sqlserver.state_update(productcode, state_num='6')
        sqlserver.sku_delete(sku)
        return f'{sku}_放入新品建库', symbol
    if state == '6':
        return f'{sku}_放入新品建库', 6
    raise ValueError(f'unhandled state {state!r} for product code {productcode!r}')
def right(sku, name, product_price):
    """Handle a SKU that has no product code yet.

    The product name is matched against the database through the
    module-level ``similar`` helper.  On a match the row is processed like a
    known product; otherwise it is labelled as a new product.

    :param sku: SKU used to build the returned label.
    :param name: product name used for the model lookup.
    :param product_price: price taken from the input table.
    :return: ``(label, symbol)`` in the same shape as ``left`` returns, so
        the caller can always unpack two values; symbol is 6 for new products.
    """
    # The matcher class visible in this code base exposes istrue_database();
    # it has no istrue() method.
    if similar.istrue_database(name):
        rows = similar.productcode_get()
        if rows:
            # NOTE(review): the lookup returns every matching row; the first
            # match is used here - confirm that is the intended tie-break.
            first = rows[0] if isinstance(rows, (list, tuple)) else rows
            productcode = first[0] if isinstance(first, (list, tuple)) else first
            return left(productcode, sku, product_price)
    return f'{sku}_放入新品建库', 6
def table_save(table, lyst):
    """Export the rows listed in ``lyst`` as the price-entry CSV.

    A constant merchant-code column is prepended, the SKU id is turned into
    a JD item URL, the columns are renamed to their export titles and the
    selected rows are written to '价格录入.csv' through ``pre``.

    :param table: source table containing the five columns selected below.
    :param lyst: row labels of ``table`` to export.
    """
    source_cols = ['库内已有产品', 'NAME', 'jd_price', 'sku_id', 'create_date']
    frame = table[source_cols]
    frame = pd.concat([frame, pd.DataFrame(columns=['商户编码'])])
    frame = frame.reindex(columns=['商户编码', '库内已有产品', 'NAME', 'jd_price', 'sku_id', 'create_date'])
    # Fill both derived columns in one pass each instead of cell by cell.
    frame['商户编码'] = 'DS-JD'
    frame['sku_id'] = [f"https://item.jd.com/{sku_id}.html" for sku_id in frame['sku_id']]
    export_titles = {
        '库内已有产品': '产品编码',
        'NAME': '产品名称',
        'jd_price': '产品价格',
        'sku_id': '产品链接',
        'create_date': '创建时间',
    }
    frame = frame.rename(columns=export_titles)
    selected = frame.loc[lyst].reset_index(drop=True)
    pre(selected, '价格录入.csv')
def main():
    """Run the whole pipeline on 'new.xlsx'.

    Loads the table, classifies every row with ``logic_function``, exports
    the price-entry CSV and appends the SKUs of each category to
    '补齐重要参数.txt' and '新品建库.txt'.  If anything after the
    classification fails, the labels and price rows are printed so the work
    done so far is not lost.
    """
    table = data_load('new.xlsx').reset_index(drop = True)
    data_list, price_list = logic_function(table)
    try:
        table_save(table, price_list)
        data_list_buqi = []
        data_list_xinping = []
        for val in data_list:
            parts = val.split('_')
            if parts[-1] == '补齐重要参数':
                data_list_buqi.append(parts[-2])
            elif parts[-1] == '放入新品建库':
                data_list_xinping.append(parts[-2])
        print('补齐重要参数\n', data_list_buqi)
        print('新品建库\n', data_list_xinping)
        # Context managers close the files even when a write fails.
        with open('补齐重要参数.txt', 'a+', encoding="utf-8", errors='ignore') as f:
            f.writelines(val + "\n" for val in data_list_buqi)
        with open('新品建库.txt', 'a+', encoding="utf-8", errors='ignore') as f:
            f.writelines(val + "\n" for val in data_list_xinping)
        print('完成!')
    except Exception as exc:
        # Report the cause instead of hiding it, then dump the results so
        # they can be recovered by hand.
        print('出错,打印出sku标签列表和价格录入的行号!')
        print(repr(exc))
        print(data_list)
        print(price_list)


if __name__ == "__main__":
    main()
# -*- coding:utf-8 -*-
import pymssql
import pandas as pd
import xlrd
from product_type_extract import *
from brand_name import brandd
import numpy as np
import re
def BN(str):
    """Normalise a brand name for comparison.

    Returns the Chinese characters of ``str`` joined together; when there
    are none, returns its ASCII letters and digits instead.
    """
    chinese = ''.join(re.findall(r'[\u4E00-\u9FA5]', str))
    if chinese:
        return chinese
    return ''.join(re.findall(r'[a-zA-Z0-9]', str))
def data_load(path):
    """Load the first worksheet of an Excel workbook into a DataFrame.

    The first sheet row is used as the column header and removed from the
    data; the remaining rows keep their original index labels (starting at 1).

    :param path: path of the workbook, opened with xlrd.
    :return: DataFrame holding the sheet contents.
    """
    sheet = xlrd.open_workbook(path).sheets()[0]
    frame = pd.DataFrame([])
    for col in range(sheet.ncols):
        frame[col] = sheet.col_values(col)
    # Promote row 0 to the header, then drop it from the body.
    frame = frame.rename(columns=frame.iloc[0, :])
    frame = frame.drop([0], axis=0)
    return frame
#def judge_subclass()
def pre(matrix_text, addr):
    """Export a DataFrame as CSV.

    :param matrix_text: the data to export.
    :param addr: destination path (or file-like object) of the CSV.
    """
    export_options = {
        'sep': ',',
        'index': 0,  # do not write the row index
        'encoding': 'utf_8_sig',
        'columns': matrix_text.columns,
    }
    matrix_text.to_csv(addr, **export_options)
class similar():
    """Match a product name against the model numbers stored in the database.

    Querying the roughly 440k stored model numbers is slow; the lookup
    against '名称提取型号.csv' should get a better search algorithm.
    """

    # Rows returned by the most recent lookup; None until one has run, so
    # productcode_get() is safe to call at any time.
    productcode = None

    def __init__(self, brand_table='平安新品分类.9.17未匹配.xlsx'):
        self.model1 = brandd(brand_table)
        self.brandname = self.model1.runing()

    def productcode_get(self):
        """Return the rows found by the last ``istrue_database`` call."""
        return self.productcode

    def istrue_database(self, type_name):
        """Return True if a model extracted from ``type_name`` is in the database.

        The matching rows are stored and can be read with
        ``productcode_get``.

        :param type_name: product name to extract the model number from.
        :return: True only when the extraction succeeded and the lookup
            returned at least one row.
        """
        # Forget the previous result so a failed lookup never exposes stale rows.
        self.productcode = None
        type_single = Extract(type_name, self.brandname).runing()
        if type_single == 'na':
            return False
        sql_find_1 = sql_find()
        try:
            self.productcode = sql_find_1.type_search(type_single)
        finally:
            # One connection is opened per lookup; release it explicitly
            # instead of leaking it.
            sql_find_1.conn.close()
        # An empty result list is "no match"; comparing with None was always
        # true for the list returned by the lookup.
        return bool(self.productcode)
class sql_find():
    """Thin wrapper around one pymssql connection to the product database.

    Values are always passed as query parameters so that quotes inside a
    product code, SKU or model string cannot break (or inject into) the SQL.
    Table and column names cannot be parameterised and are interpolated, so
    they must only ever come from trusted code, never from input data.
    """

    def __init__ (self):
        # NOTE(review): host and credentials are hard-coded in source control;
        # move them to configuration/environment and rotate the password.
        self.conn = pymssql.connect(host='123.56.115.207', user='zgcprice3311', password='admin@2018@)!*', database='ZI_DataBase', autocommit=True)
        self.cursor = self.conn.cursor()

    def close(self):
        """Close the underlying database connection."""
        self.conn.close()

    def state_extract(self, productcode, database_name='Info_Product', state_name='State'):
        """Return the state stored for ``productcode``, or None if there is no row."""
        self.cursor.execute(
            f"select {state_name} from {database_name} where ProductCode = %s",
            (str(productcode),),
        )
        row = self.cursor.fetchone()
        return row[0] if row is not None else None

    def price_get(self, productcode, database_name='ZI_Price_Quote', name='MaximumPrice'):
        """Return the price stored for ``productcode`` as a string.

        Returns None when there is no row for the product code.
        """
        self.cursor.execute(
            f"select {name} from {database_name} where ProductCode = %s",
            (str(productcode),),
        )
        row = self.cursor.fetchone()
        return str(row[0]) if row is not None else None

    def state_update(self, productcode, state_num, database_name='Info_Product', state_name='State'):
        """Set the state of ``productcode`` to ``state_num`` if it differs.

        :return: 1 or 6 when the state was changed to '1' or '6'
            respectively, otherwise 0.
        :raises TypeError: if the product code has no row in the table.
        """
        self.cursor.execute(
            f"select {state_name} from {database_name} where ProductCode =%s",
            (str(productcode),),
        )
        n = self.cursor.fetchone()[0]
        if n != state_num:
            self.cursor.execute(
                f"update {database_name} set {state_name}=%s where ProductCode =%s",
                (state_num, str(productcode)),
            )
            print(f'{productcode} 状态已由 {n} 更改为 {state_num} 。')
            if state_num == '1':
                return 1
            elif state_num == '6':
                return 6
        return 0

    def price_update(self, productcode, price, database_name='ZI_Price_Quote', state_name='MaximumPrice'):
        """Write ``price`` into the price column of ``productcode``."""
        # The column name comes from ``state_name``; the bare name
        # MaximumPrice used here before was undefined and raised NameError.
        self.cursor.execute(
            f"update {database_name} set {state_name}=%s where ProductCode =%s",
            (str(price), str(productcode)),
        )
        print(f'{productcode} 价格已经更改为 {price} 。')

    def sku_delete(self, sku, database_name='productcode_sku'):
        """Delete the mapping row(s) of ``sku``."""
        self.cursor.execute(f"delete from {database_name} where sku =%s", (str(sku),))

    def type_search(self, product_type, database_name='product_type', state_name='productcode'):
        """Return every row whose ``product_type`` equals the given model string."""
        self.cursor.execute(
            f"select {state_name} from {database_name} where product_type = %s",
            (str(product_type),),
        )
        return self.cursor.fetchall()

    def brand_search(self, productcode):
        """Return the brand name recorded for ``productcode``.

        The brand id is the stored BrandCode without its first four characters.

        :raises TypeError: if either lookup returns no row.
        """
        self.cursor.execute(
            "select BrandCode from info_product where productcode = %s",
            (str(productcode),),
        )
        brand_id = self.cursor.fetchone()[0][4:]
        self.cursor.execute(
            "select brandname from zi_brand_list where BrandID = %s",
            (str(brand_id),),
        )
        return self.cursor.fetchone()[0]
import jieba
import re
import xlrd
import pandas as pd
import numpy as np
class Extract:
    """Heuristically extract a model-number token from a product name.

    The name is segmented with jieba, tokens are filtered down to candidates
    (contain a digit, 3 to 10 characters long) and a chain of tie-break rules
    picks one of them.  ``runing`` returns the chosen token or 'na'.

    NOTE(review): the tie-break rules are kept exactly as they behaved
    before, because stored model strings may have been produced by this very
    algorithm; only crashes and side effects are fixed here.
    """

    def __init__(self, string_non, BrandName):
        # string_non: the raw product name; BrandName: collection of known
        # brand tokens used by select_three.
        self.string_non = string_non
        self.BrandName = BrandName

    def contain_num(self, string):
        """Return True if ``string`` contains a digit."""
        return bool(re.search('[0-9]', string))

    def contain_let(self, string):
        """Return True if ``string`` contains an ASCII letter."""
        # 'A-z' would also match the punctuation between 'Z' and 'a'.
        return bool(re.search('[A-Za-z]', string))

    def contain_chin(self, string):
        """Return True if ``string`` contains a Chinese character."""
        return re.search(u'[\u4e00-\u9fa5]', string) is not None

    def contain_sym(self, string):
        """Return True if ``string`` contains '+' or '-'."""
        return bool(re.search('[-+]', string))

    def select_num(self, string):
        """Return the first number found in ``string``.

        :raises IndexError: if ``string`` contains no digit.
        """
        num = re.findall(r"\d+\.?\d*", string)
        return num[0]

    def seek_type_maybe(self, str_single):
        """Tokenise ``str_single`` and return ``{position: token}``.

        Punctuation listed in ``sym`` is removed first.  A standalone '-' or
        '+' token is replaced by its neighbours joined around it.
        """
        sym = ('+', ';', ':', '(', '(', ')', ')', ",", ',', '【', '】')
        word_goal = ''.join(word + " " for word in jieba.cut(str_single))
        word_goal = ''.join(c for c in word_goal if c not in sym)  # strip punctuation
        vec = word_goal.split()
        last = len(vec) - 1
        type_maybe = {}
        for i, token in enumerate(vec):
            if token in ('-', '+'):
                # A separator at either end has only one neighbour; indexing
                # blindly raised IndexError at the end and wrapped around to
                # the last token at the start.
                before = vec[i - 1] if i > 0 else ''
                after = vec[i + 1] if i < last else ''
                type_maybe[i] = before + token + after
            else:
                type_maybe[i] = token
        return type_maybe

    def select_one(self, type_in):
        """Keep the entries of the dict ``type_in`` whose value contains a digit."""
        return {key: value for key, value in type_in.items() if self.contain_num(value)}

    def select_two(self, type_in):
        """Keep the entries of the dict ``type_in`` whose value is 3 to 10 characters long."""
        return {key: value for key, value in type_in.items() if 3 <= len(value) <= 10}

    def select_three(self, type_in, type_st):
        """Return the first candidate that directly follows brand tokens.

        A candidate at position 1 must be preceded by one brand token; a
        candidate further in must be preceded by two.  Returns the candidate
        string, or an empty list when none qualifies.
        """
        type_out = []
        for key in type_in:
            index = key
            if index == 1:
                if type_st[index - 1] in self.BrandName:
                    type_out = type_in[key]
                    break
            elif index >= 2:
                if (type_st[index - 1] in self.BrandName) and (type_st[index - 2] in self.BrandName):
                    type_out = type_in[key]
                    break
        return type_out

    @staticmethod
    def _token_at(type_st, position):
        """Return ``type_st[position]`` or None when that position does not exist."""
        try:
            return type_st[position]
        except (KeyError, IndexError):
            return None

    def select_four(self, type_in, type_st):
        """Pick tokens for candidates preceded by the literal token '型号'.

        Returns a de-duplicated list.
        """
        type_product = []
        keys = list(type_in)  # built once instead of on every iteration
        for i in range(1, len(keys)):
            index = keys[i]
            # Positions before the first token do not exist in the token
            # dict; looking them up directly raised KeyError.
            if (self._token_at(type_st, index - 1) == '型号') and (self._token_at(type_st, index - 2) == '型号'):
                # NOTE(review): this appends the token at the candidate's
                # ordinal ``i`` rather than at its position ``index`` - kept
                # as is, confirm which one is intended.
                type_product.append(type_st[i])
        return list(set(type_product))

    def select_five(self, type_mul):
        """Return a list holding the first candidate that contains '+' or '-'."""
        type_sin = []
        for key in type_mul:
            if self.contain_sym(type_mul[key]):
                type_sin.append(type_mul[key])
                break
        return type_sin

    def select_six(self, type_mul):
        """Pick a candidate by its leading number.

        With two or more candidates, looks for a pair sharing the same first
        number and returns the shorter (or equal-length) member of the last
        such pair; with a single candidate returns it.  Returns an empty list
        when nothing is selected.

        NOTE(review): whether the fallback branch belongs to the size check
        or to the pair comparison should be confirmed against the upstream
        file; the size-check reading is used here.
        """
        type_select = []
        if not type_mul:
            # Nothing to choose from; max() below would raise on empty input.
            return type_select
        numbers = {key: self.select_num(value) for key, value in type_mul.items()}
        if len(type_mul) >= 2:
            for key_i in type_mul:
                for key_j in type_mul:
                    same_number = numbers[key_i] == numbers[key_j]
                    if same_number and key_i != key_j and len(type_mul[key_i]) >= len(type_mul[key_j]):
                        type_select = type_mul[key_j]
        else:
            index_max = max(numbers, key=numbers.get)
            type_select = type_mul[index_max]
        return type_select

    def runing(self):
        """Return the extracted model token, or 'na' when there is no candidate."""
        tokens = self.seek_type_maybe(self.string_non)  # segment the name
        candidates = self.select_two(self.select_one(tokens))  # digit + length filter
        if len(candidates) == 1:
            type_final = list(candidates.values())[0]
        else:
            # Try each tie-break rule in order and stop at the first one that
            # yields a result of length one; otherwise keep all candidates.
            # NOTE(review): select_three and select_six return a string, whose
            # length is the character count, so this test only accepts the
            # list results of select_five/select_four - confirm the intent.
            stages = (
                lambda: self.select_three(candidates, tokens),
                lambda: self.select_five(candidates),
                lambda: self.select_four(candidates, tokens),
                lambda: self.select_six(candidates),
            )
            type_final = candidates
            for stage in stages:
                picked = stage()
                if len(picked) == 1:
                    type_final = picked
                    break
        # The result is a plain local now; it used to be written to a module
        # global, which could leak a previous call's value.
        if len(type_final) == 0:
            return 'na'
        if isinstance(type_final, str):
            return type_final
        if isinstance(type_final, list):
            return type_final[0]
        # Several candidates left: take the longest (first one on ties).
        return max(type_final.values(), key=len)
def read_in(addr_in):
    """Read the 'productname' column from an Excel workbook.

    The first worksheet is loaded, its first row is used as the header and
    the 'productname' column is returned row by row.

    :param addr_in: path of the workbook, opened with xlrd.
    :return: list with one single-element numpy array per data row.
    """
    sheet = xlrd.open_workbook(addr_in).sheets()[0]
    frame = pd.DataFrame([])
    for col in range(sheet.ncols):
        frame[col] = sheet.col_values(col)
    # Promote row 0 to the header, then drop it from the body.
    frame = frame.rename(columns=frame.iloc[0, :]).drop([0], axis=0)
    names = frame[['productname']]  # the non-standard names to match
    return list(np.array(names))
def out_data(data, addr):
    """Write the extracted types to a single-column CSV.

    The column title 'product_type' is written as the first data row, below
    the default header that pandas emits.

    :param data: iterable of extracted type strings; it is not modified
        (the title used to be inserted into the caller's list).
    :param addr: destination path (or file-like object) of the CSV.
    """
    rows = ['product_type', *data]
    pd.DataFrame(rows).to_csv(addr, index=0)
from moudules import *
# Disabled experiment kept as a string literal: load the test CSV and print one cell.
'''
table = pd.read_csv('名称提取型号_test.csv').reset_index(drop = True)
print(table.iloc[0][1])
'''
# Sample product name for manual testing; only the disabled snippet at the
# bottom of this script reads it.
abc = '佑游泳镜泳帽男女防水防雾游泳耳塞鼻夹眼镜高清泳镜套装Z6615电镀粉色平光'
# NOTE: rebinds the name `similar` from the class to an instance.
similar = similar()
sqlserver = sql_find()
# Smoke test: print the brand name stored for one hard-coded product code.
print(sqlserver.brand_search(6612086630086))
# Disabled experiment kept as a string literal: match `abc` against the database.
'''
if similar.istrue_database(abc):
print(similar.productcode_get())
else: print('na')
'''
from moudules import *
import pandas as pd
class Index(object):
    """Render a textual progress bar such as ``[#####     ] 50.0``."""

    def __init__(self, number=50, decimal=2):
        """
        :param number: width of the bar, i.e. how many '#' make 100 %
        :param decimal: number of decimal places kept in the percentage
        """
        self.decimal = decimal
        self.number = number
        self.a = 100/number  # percentage points represented by one '#'

    def __call__(self, now, total):
        """Return the bar for ``now`` out of ``total``, prefixed with a carriage return."""
        percentage = self.percentage_number(now, total)
        well_num = int(percentage / self.a)
        return "\r%s %s" % (self.progress_bar(well_num), percentage)

    def percentage_number(self, now, total):
        """
        Compute the completed percentage.

        :param now: current count
        :param total: total count
        :return: the percentage rounded to ``self.decimal`` places; 100.0 when
            ``total`` is 0, because there is nothing left to do (this used to
            raise ZeroDivisionError)
        """
        if total == 0:
            return round(100.0, self.decimal)
        return round(now / total * 100, self.decimal)

    def progress_bar(self, num):
        """
        Build the bar itself.

        :param num: how many '#' to draw
        :return: the bar, padded with spaces to ``self.number`` characters
        """
        filled = "#" * num
        padding = " " * (self.number - num)
        return '[%s%s]' % (filled, padding)
def flip90_right(arr):
    """Rotate a 2-D array by 90 degrees clockwise.

    Implemented as: reverse every axis, transpose, then reverse the first
    axis again.

    :param arr: numpy array.
    :return: the rotated array.
    """
    fully_reversed = np.flip(arr)
    return fully_reversed.T[::-1]
def out_data(sku, pcode, addr):
    """Write the SKU to product-code mapping as a two-column CSV.

    The first line is the title row ``sku,productcode``; every following
    line pairs one SKU with its match result.

    :param sku: list of SKUs; not modified (the title used to be inserted
        into the caller's list).
    :param pcode: list of match results, same length as ``sku``; not modified.
    :param addr: destination path (or file-like object) of the CSV.
    """
    # One row per input list with its title in front; transposing turns the
    # two rows into the two CSV columns.
    table = np.array([['sku', *sku], ['productcode', *pcode]]).T
    pd.DataFrame(table).to_csv(addr, header=0, index=0, encoding='utf_8_sig')
# Interactive script: match every product of a CSV table against the database
# by extracted model number and brand, then save the SKU to product-code mapping.
print('请保证型号待匹配表表头包含:‘SKU’,‘product_type’,‘NAME’。')
table_name = input('请输入型号待匹配表的路径:')
save_name = input('请输入保存匹配后的表的路径:')
index = Index()
similar = similar()
sqlserver = sql_find()
table = pd.read_csv(table_name)
SKU_lyst = list(table['SKU'])
l = len(SKU_lyst)
sku2pcode_lyst = []
for m, (sku, NAME, brand) in enumerate(zip(table['SKU'], table['NAME'], table['product_band'])):
    try:
        # True when a model extracted from the name exists in the database.
        if similar.istrue_database(NAME):
            # Rows of product codes that share the extracted model.
            productcode_lyst = similar.productcode_get()
            if len(productcode_lyst) < 10:
                # Keep only the candidates whose brand matches the row's brand.
                pcodelyst = []
                for row in productcode_lyst:
                    if BN(brand) == BN(sqlserver.brand_search(row[0])):
                        pcodelyst.append(row[0])
                if len(pcodelyst) == 0:
                    sku2pcode = 'na'
                elif len(pcodelyst) < 5:
                    sku2pcode = str(','.join(pcodelyst))
                    print(sku2pcode)
                else:
                    # Five or more matches.  Exactly five used to fall through
                    # every branch and reuse the previous row's result.
                    sku2pcode = '匹配5项以上。'
            else:
                sku2pcode = '匹配5项以上。'
        else:
            sku2pcode = 'na'
    except Exception as exc:
        print(f'\n数据库连接不稳定。SKU{sku}产品出错,请人工核实此条。')
        print(repr(exc))
        sku2pcode = 'error'
    # A one-row table has l - 1 == 0; show it as complete instead of
    # dividing by zero.
    print(index(m, l - 1) if l > 1 else index(1, 1), end='%')
    sku2pcode_lyst.append(sku2pcode)
# Save to the path the user entered; the old fixed name is only the fallback
# for an empty answer (the entered path used to be ignored).
out_data(SKU_lyst, sku2pcode_lyst, save_name or '2.csv')
print('\n完成!')
import pymssql
import pandas as pd
def pre(matrix_text, addr):
    """Export a DataFrame as CSV.

    :param matrix_text: the data to export.
    :param addr: destination path (or file-like object) of the CSV.
    """
    # index=0 suppresses the row index; the BOM-prefixed UTF-8 encoding is
    # kept as in every other export of this project.
    to_csv_kwargs = dict(sep=',', index=0, encoding='utf_8_sig', columns=matrix_text.columns)
    matrix_text.to_csv(addr, **to_csv_kwargs)
# Export code, name and category columns of every product whose state is not
# '6' to '产品名称.csv'.
# NOTE(review): host and credentials are hard-coded in source control; move
# them to configuration/environment and rotate the password.
conn = pymssql.connect(host='123.56.115.207', user='zgcprice3311',password='admin@2018@)!*',database='ZI_DataBase')
try:
    cur = conn.cursor()
    sql_1 = "select productcode,productname,categoryid,subcategorycode from info_product where state <> '6' "
    cur.execute(sql_1)
    data = (cur.fetchall())
    # The first item of each cursor.description entry is the column name.
    columns = [desc[0] for desc in cur.description]
finally:
    # Release the connection even if the query fails.
    conn.close()
data_1 = pd.DataFrame(data, columns=columns)
pre(data_1, '产品名称.csv')
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment