Commit 4e536334 authored by rico.liu

update stopword

parent e8970176
......@@ -10,6 +10,7 @@ from lxml import etree
import re
import requests
import json
import pymssql
def get_response(session,url,headers):
'''
......@@ -37,6 +38,12 @@ def get_response(session,url,headers):
def checkData(check_data):
conn = pymssql.connect(host='123.56.115.207', user='zgcprice3311',password='admin@2018@)!*',database= 'zi_zh',autocommit=True)
cursor = conn.cursor()
cursor.execute('select stop_word from Stopwords')
data = (cursor.fetchall())
stopword_list = pd.DataFrame(data,columns=['stopword'])['stopword'].tolist()
headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'}
session = requests.Session()
if check_data.empty:
......@@ -94,7 +101,13 @@ def checkData(check_data):
if "自营" in str(ziying):
name = html.xpath(
"//div[@class='sku-name']/text()")
if ("定制"in str(name)) or ("防弹"in str(name)) or ("射击"in str(name)) or ("订制"in str(name)) or ("卫星"in str(name)) or ("靶"in str(name)) or ("企业定制"in str(name)) or ("军迷"in str(name)) or ("携行具"in str(name)) or ("出版社"in str(name)) or ("书籍"in str(name)) or ("出版社" in str(brand)) or ("书籍" in str(subcategory)) or ("酒"in str(name) and "酒精" not in str(name)):
flag = False
for stopword in stopword_list:
if stopword in str(name):
flag = True
break
if flag:
print("定制/专用/书籍类产品暂不通过")
result.append("定制/专用/书籍类产品暂不通过")
id_all.append(date_id)
......@@ -184,7 +197,12 @@ def checkData(check_data):
if len(ziying) == 1:
name = html.xpath(
"//*[@id='gm-prd-main']/div[1]/h1/text()")
if ("定制"in str(name)) or ("防弹"in str(name)) or ("射击"in str(name)) or ("订制"in str(name)) or ("卫星"in str(name)) or ("靶"in str(name)) or ("企业定制"in str(name)) or ("出版社"in str(name)) or ("书籍"in str(name)) or ("出版社" in str(brand)) or ("书籍" in str(subcategory)) or ("酒"in str(name) and "酒精" not in str(name)):
flag = False
for stopword in stopword_list:
if stopword in str(name):
flag = True
break
if flag:
print("定制/专用/书籍类产品暂不通过")
result.append("定制/专用/书籍类产品暂不通过")
id_all.append(date_id)
......@@ -237,7 +255,7 @@ def checkData(check_data):
elif "suning" in str(main_url):
try:
#main_url = 'http://product.suning.com/0000000000/11673575307.html'
#main_url = 'http://product.suning.com/0000000000/10643583782.html'
sku = re.findall(".com/(.*?).html",main_url)[0]
main_url_ = 'https://product.suning.com/' + sku + '.html'
r = get_response(session,main_url_,headers)
......@@ -258,16 +276,15 @@ def checkData(check_data):
isPublished = itemDetail["isPublished"]
except:
isPublished = '0'
product_name = itemDetail["cmmdtyTitle"]
name = itemDetail["cmmdtyTitle"]
if isPublished == '1':
if '此款有货' in str(youhuo_) :
if ("定制"in str(product_name)) or ("防弹"in str(product_name)) or ("射击"in str(product_name)) \
or ("订制"in str(product_name)) or ("卫星"in str(product_name)) \
or ("靶"in str(product_name)) or ("企业定制"in str(product_name)) \
or ("军迷"in str(product_name)) or ("携行具"in str(product_name)) \
or ("出版社"in str(name)) or ("书籍"in str(name)) \
or ("出版社" in str(brand)) or ("书籍" in str(subcategory))\
or ("酒"in str(name) and "酒精" not in str(name)):
flag = False
for stopword in stopword_list:
if stopword in str(name):
flag = True
break
if flag:
print("定制/专用/书籍类产品暂不通过")
result.append("定制/专用/书籍类产品暂不通过")
price_list.append(sn_price)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment