Commit 4e536334 authored by rico.liu

update stopword

parent e8970176
...@@ -10,6 +10,7 @@ from lxml import etree ...@@ -10,6 +10,7 @@ from lxml import etree
import re import re
import requests import requests
import json import json
import pymssql
def get_response(session,url,headers): def get_response(session,url,headers):
''' '''
...@@ -37,6 +38,12 @@ def get_response(session,url,headers): ...@@ -37,6 +38,12 @@ def get_response(session,url,headers):
def checkData(check_data): def checkData(check_data):
conn = pymssql.connect(host='123.56.115.207', user='zgcprice3311',password='admin@2018@)!*',database= 'zi_zh',autocommit=True)
cursor = conn.cursor()
cursor.execute('select stop_word from Stopwords')
data = (cursor.fetchall())
stopword_list = pd.DataFrame(data,columns=['stopword'])['stopword'].tolist()
headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'} headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'}
session = requests.Session() session = requests.Session()
if check_data.empty: if check_data.empty:
...@@ -94,7 +101,13 @@ def checkData(check_data): ...@@ -94,7 +101,13 @@ def checkData(check_data):
if "自营" in str(ziying): if "自营" in str(ziying):
name = html.xpath( name = html.xpath(
"//div[@class='sku-name']/text()") "//div[@class='sku-name']/text()")
if ("定制"in str(name)) or ("防弹"in str(name)) or ("射击"in str(name)) or ("订制"in str(name)) or ("卫星"in str(name)) or ("靶"in str(name)) or ("企业定制"in str(name)) or ("军迷"in str(name)) or ("携行具"in str(name)) or ("出版社"in str(name)) or ("书籍"in str(name)) or ("出版社" in str(brand)) or ("书籍" in str(subcategory)) or ("酒"in str(name) and "酒精" not in str(name)): flag = False
for stopword in stopword_list:
if stopword in str(name):
flag = True
break
if flag:
print("定制/专用/书籍类产品暂不通过") print("定制/专用/书籍类产品暂不通过")
result.append("定制/专用/书籍类产品暂不通过") result.append("定制/专用/书籍类产品暂不通过")
id_all.append(date_id) id_all.append(date_id)
...@@ -184,7 +197,12 @@ def checkData(check_data): ...@@ -184,7 +197,12 @@ def checkData(check_data):
if len(ziying) == 1: if len(ziying) == 1:
name = html.xpath( name = html.xpath(
"//*[@id='gm-prd-main']/div[1]/h1/text()") "//*[@id='gm-prd-main']/div[1]/h1/text()")
if ("定制"in str(name)) or ("防弹"in str(name)) or ("射击"in str(name)) or ("订制"in str(name)) or ("卫星"in str(name)) or ("靶"in str(name)) or ("企业定制"in str(name)) or ("出版社"in str(name)) or ("书籍"in str(name)) or ("出版社" in str(brand)) or ("书籍" in str(subcategory)) or ("酒"in str(name) and "酒精" not in str(name)): flag = False
for stopword in stopword_list:
if stopword in str(name):
flag = True
break
if flag:
print("定制/专用/书籍类产品暂不通过") print("定制/专用/书籍类产品暂不通过")
result.append("定制/专用/书籍类产品暂不通过") result.append("定制/专用/书籍类产品暂不通过")
id_all.append(date_id) id_all.append(date_id)
...@@ -237,7 +255,7 @@ def checkData(check_data): ...@@ -237,7 +255,7 @@ def checkData(check_data):
elif "suning" in str(main_url): elif "suning" in str(main_url):
try: try:
#main_url = 'http://product.suning.com/0000000000/11673575307.html' #main_url = 'http://product.suning.com/0000000000/10643583782.html'
sku = re.findall(".com/(.*?).html",main_url)[0] sku = re.findall(".com/(.*?).html",main_url)[0]
main_url_ = 'https://product.suning.com/' + sku + '.html' main_url_ = 'https://product.suning.com/' + sku + '.html'
r = get_response(session,main_url_,headers) r = get_response(session,main_url_,headers)
...@@ -258,16 +276,15 @@ def checkData(check_data): ...@@ -258,16 +276,15 @@ def checkData(check_data):
isPublished = itemDetail["isPublished"] isPublished = itemDetail["isPublished"]
except: except:
isPublished = '0' isPublished = '0'
product_name = itemDetail["cmmdtyTitle"] name = itemDetail["cmmdtyTitle"]
if isPublished == '1': if isPublished == '1':
if '此款有货' in str(youhuo_) : if '此款有货' in str(youhuo_) :
if ("定制"in str(product_name)) or ("防弹"in str(product_name)) or ("射击"in str(product_name)) \ flag = False
or ("订制"in str(product_name)) or ("卫星"in str(product_name)) \ for stopword in stopword_list:
or ("靶"in str(product_name)) or ("企业定制"in str(product_name)) \ if stopword in str(name):
or ("军迷"in str(product_name)) or ("携行具"in str(product_name)) \ flag = True
or ("出版社"in str(name)) or ("书籍"in str(name)) \ break
or ("出版社" in str(brand)) or ("书籍" in str(subcategory))\ if flag:
or ("酒"in str(name) and "酒精" not in str(name)):
print("定制/专用/书籍类产品暂不通过") print("定制/专用/书籍类产品暂不通过")
result.append("定制/专用/书籍类产品暂不通过") result.append("定制/专用/书籍类产品暂不通过")
price_list.append(sn_price) price_list.append(sn_price)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment