Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Z
ZHOUXINGYU_project
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhou
ZHOUXINGYU_project
Commits
a283447e
Commit
a283447e
authored
Jan 19, 2020
by
sanlu
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
爬取结束后,直接调用crawl_data_run()即可
parent
6cc425dc
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
298 additions
and
0 deletions
+298
-0
API_ALL.py
API_ALL.py
+295
-0
main_merge.py
main_merge.py
+3
-0
No files found.
API_ALL.py
0 → 100644
View file @
a283447e
# -*- coding: utf-8 -*-
"""
Created on Sun Jan 19 10:30:51 2020
@author: Administrator
"""
import
pymssql
import
pandas
as
pd
from
lxml
import
etree
import
re
import
requests
import
json
import
time
import
datetime
import
os
def Get_new():
    """Copy not-yet-processed rows from ``product`` into ``product_all``.

    Selects every ``product`` row whose ``isdo`` flag is NULL, marks it as
    processed (``isdo='1'``) and inserts the relevant columns into
    ``product_all``.  Runs against the ``reverse_data`` database with
    autocommit enabled, so each statement commits immediately.

    Returns:
        None
    """
    # NOTE(review): credentials are hard-coded; consider moving them to
    # configuration / environment variables.
    conn = pymssql.connect(
        host='123.56.115.207',
        user='zgcprice3311',
        password='admin@2018@)!*',
        database='reverse_data',
        autocommit=True,
    )
    cur = conn.cursor()
    try:
        cur.execute("select * from product where isdo is null ")
        rows = cur.fetchall()
        # `col` instead of the original `tuple`, which shadowed the builtin.
        df = pd.DataFrame(rows, columns=[col[0] for col in cur.description])
        for _, dt in df.iterrows():
            dt_id = dt.product_id
            dt_sku = dt.channel_sku
            # Replace ASCII single quotes in the name, as the original did;
            # kept so stored data stays byte-compatible with existing rows.
            dt_name = dt.product_name.replace('\'', '‘')
            dt_sub = dt.channel_product_classify
            # Bug fix: strip('') stripped nothing; strip() trims whitespace.
            dt_brand = dt['brand'].strip()
            dt_url = dt.channel_product_id
            dt_frm = dt.channel_id
            # Parameterized statements instead of f-string SQL
            # (injection-safe and quote-safe; pymssql uses %s placeholders).
            cur.execute(
                "update product set isdo='1' where product_id = %s",
                (dt_id,),
            )
            cur.execute(
                "insert into product_all"
                "(sku,name,brand,category,url,source,product_id) "
                "values (%s,%s,%s,%s,%s,%s,%s)",
                (dt_sku, dt_name, dt_brand, dt_sub, dt_url, dt_frm, dt_id),
            )
    finally:
        # Close cursor before connection (the original closed the
        # connection first, leaking the cursor if close() raised).
        cur.close()
        conn.close()
    return None
def get_reponse(session, url, headers):
    """GET *url* with *session*, retrying on failure.

    Makes up to 10 attempts (one initial try plus 9 retries), each with a
    5-second timeout.  Any exception raised by the request, or a non-200
    status code, counts as a failure and triggers the next attempt.  The
    original version silently returned ``None`` when the first response
    was non-200; this version consistently returns ``-1`` on any failure,
    which is what callers test for (``r == -1``).

    Args:
        session: object with a ``get(url, headers=..., timeout=...)``
            method (normally a ``requests.Session``).
        url: target URL.
        headers: HTTP header dict passed through to ``session.get``.

    Returns:
        The response object on HTTP 200, or ``-1`` when every attempt
        fails (kept as -1 for backward compatibility with callers).
    """
    for attempt in range(10):
        if attempt:
            # Same progress message the original printed on each retry.
            print('请求超时,第%s次重复请求' % attempt)
        try:
            response = session.get(url, headers=headers, timeout=5)
        except Exception:
            # Narrowed from a bare except: still covers request errors but
            # no longer swallows KeyboardInterrupt/SystemExit.
            continue
        if response.status_code == 200:
            return response
    return -1
def check_and_match():
    """Scrape availability flags for unchecked ``product_all`` rows, then
    match their SKUs to internal product codes.

    Phase 1 -- for every ``product_all`` row whose ``state`` is NULL,
    fetch the product page from its source channel (JD / GM / SN,
    identified by the ``source`` column) and write back three flags:

    * ``state``   : '1' in stock, '0' out of stock / unpublished,
                    '5' scraping raised an exception
    * ``ziying``  : '1' first-party ("自营") listing, '0' third-party,
                    '5' scraping raised an exception
    * ``dingzhi`` : '0' product name contains a blacklisted keyword
                    (custom-made / military-related terms), '1' otherwise,
                    '5' scraping raised an exception

    Rows from any other source are flagged '1'/'1'/'1' without scraping.

    Phase 2 -- for JD/SN/GM rows, look up the internal product code in
    ``ZI_DataBase..productcode_sku`` and store it on ``product_all`` when
    the match is a clean 13-character code or explicitly multi-matched.

    Returns nothing; all results are persisted via autocommitted SQL.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'}
    session = requests.Session()
    # NOTE(review): hard-coded credentials; consider moving to config.
    conn = pymssql.connect(host='123.56.115.207', user='zgcprice3311', password='admin@2018@)!*', database='reverse_data', autocommit=True)
    cur = conn.cursor()
    cur.execute("select * from product_all where state is null ")
    ress = [item for item in cur.fetchall()]
    # NOTE(review): `tuple` shadows the builtin inside this comprehension.
    all_data = pd.DataFrame(ress, columns=[tuple[0] for tuple in cur.description])
    for i in range(len(all_data)):
        # Sentinel defaults: stay "未成功处理" ("not processed") if no
        # branch below assigns a real value.
        state_in = '未成功处理'
        ziying_in = '未成功处理'
        dingzhi_in = '未成功处理'
        dh = all_data.loc[i]
        dh_id = dh.id
        dh_url = dh.url
        dh_frm = dh.source
        dh_sku = dh.sku
        if 'JD' in dh_frm or 'SN' in dh_frm or 'GM' in dh_frm:
            if "JD" in str(dh_frm):
                # --- JD (jd.com) ---
                try:
                    try_ = session.get(dh_url, headers=headers)
                    sku = dh_sku
                    # Price endpoint: p == '-1.00' means not purchasable.
                    url = "https://p.3.cn/prices/mgets?skuIds=" + str(sku)
                    r = session.get(url, headers=headers).json()
                    jd_price = r[0]['p']
                    if jd_price == '-1.00':
                        state_in = '0'
                    else:
                        state_in = '1'
                    r_ = session.get(dh_url, headers=headers)
                    html = etree.HTML(r_.text)
                    ziying = html.xpath("//div[@class='name goodshop EDropdown']/em/text()")
                    if "自营" in str(ziying):
                        ziying_in = '1'
                        name = html.xpath("//div[@class='sku-name']/text()")
                        # Keyword blacklist: custom-made / military terms.
                        if ("定制" in str(name)) or ("防弹" in str(name)) or ("射击" in str(name)) or ("订制" in str(name)) or ("卫星" in str(name)) or ("靶" in str(name)) or ("企业定制" in str(name)) or ("军迷" in str(name)) or ("携行具" in str(name)):
                            dingzhi_in = '0'
                        else:
                            dingzhi_in = '1'
                        # Stock endpoint (delivery area hard-coded).
                        url = "https://c0.3.cn/stock?skuId=" + str(sku) + "&area=1_2901_2906_0&cat=9987,653,655"
                        r = get_reponse(session, url, headers)
                        if r == -1:
                            # NOTE(review): assigns a fresh local `state`,
                            # not `state_in` -- the failure is never written
                            # back; almost certainly a typo for `state_in`.
                            state = '0'
                        else:
                            r.encoding = 'gbk'
                            is_purchase = json.loads(r.text)
                            try:
                                # "无货" == out of stock.
                                if "无货" in is_purchase['stock']['stockDesc'] or "无货" in is_purchase['stock']['StockStateName']:
                                    state_in = '0'
                                else:
                                    state_in = '1'
                            except:
                                # Fallback for the flat payload shape.
                                if "无货" in is_purchase['StockStateName']:
                                    state_in = '0'
                                else:
                                    state_in = '1'
                    else:
                        ziying_in = '0'
                except:
                    # '5' == scraping failed for this row (bare except
                    # swallows everything, including coding errors).
                    state_in = '5'
                    ziying_in = '5'
                    dingzhi_in = '5'
            elif "GM" in str(dh_frm):
                # --- GM (gome.com.cn) ---
                try:
                    r = session.get(dh_url, headers=headers)
                    html = etree.HTML(r.text)
                    content = html.xpath("//script[contains(text(),'gomePrice')]/text()")[0]
                    ziying = html.xpath("//span[@class='identify']/text()")
                    if len(ziying) == 1:
                        ziying_in = '1'
                        name = html.xpath("//*[@id='gm-prd-main']/div[1]/h1/text()")
                        if ("定制" in str(name)) or ("防弹" in str(name)) or ("射击" in str(name)) or ("订制" in str(name)) or ("卫星" in str(name)) or ("靶" in str(name)) or ("企业定制" in str(name)):
                            dingzhi_in = '0'
                        else:
                            dingzhi_in = '1'
                        # NOTE(review): `sku` is only assigned in the JD
                        # branch; on a GM row it holds a stale value from an
                        # earlier JD iteration (or raises NameError, caught
                        # by the except below). Probably meant dh_sku.
                        url = "https://ss.gome.com.cn/item/v1/d/m/store/unite/" + str(sku) + "/N/11010200/110102002/1/null/flag/item/allStores?callback=allStores"
                        r = session.get(url, headers=headers)
                        # Strip the JSONP wrapper "allStores( ... )".
                        content = r.text.replace('allStores(', '')
                        content = content.replace(')', '')
                        content = json.loads(content)
                        wuhuo = content['result']['stock']['status']
                        if wuhuo == False:
                            state_in = '0'
                        else:
                            state_in = '1'
                    else:
                        ziying_in = '0'
                except:
                    state_in = '5'
                    ziying_in = '5'
                    dingzhi_in = '5'
            elif "SN" in str(dh_frm):
                # --- SN (suning.com) ---
                try:
                    r = session.get(dh_url, headers=headers)
                    html = etree.HTML(r.text)
                    daaa = r.text
                    str2 = html.xpath("//input[@id='curPartNumber']/@value")[0]
                    ziying1 = html.xpath("//div[@class='proinfo-title']/h1/span/i/text()")
                    ziying2 = html.xpath("//h1[@id='itemDisplayName']/span/text()")
                    # NOTE(review): `youhuo_` and `daohuo` below are
                    # computed but never used.
                    youhuo_ = re.findall("id=\"ie7_onsale\">(.*?)<i", daaa)
                    if "自营" in ziying1 or "自营" in ziying2:
                        ziying_in = '1'
                        daohuo = html.xpath("//a[@id='tellMe']/span/text()")
                        url_json = f'https://product.suning.com/pds-web/ajax/itemUniqueInfo_{str(str2)}_0000000000.html'
                        response_json = session.get(url_json, headers=headers)
                        json_data = json.loads(response_json.text)
                        itemDetail = json_data["itemDetail"]
                        try:
                            isPublished = itemDetail["isPublished"]
                        except:
                            # Missing key -> treat as unpublished.
                            isPublished = '0'
                        product_name = itemDetail["cmmdtyTitle"]
                        if isPublished == '1':
                            state_in = '1'
                            if ("定制" in str(product_name)) or ("防弹" in str(product_name)) or ("射击" in str(product_name)) \
                                or ("订制" in str(product_name)) or ("卫星" in str(product_name)) \
                                or ("靶" in str(product_name)) or ("企业定制" in str(product_name)) \
                                or ("军迷" in str(product_name)) or ("携行具" in str(product_name)):
                                dingzhi_in = '0'
                            else:
                                dingzhi_in = '1'
                        else:
                            state_in = '0'
                    else:
                        ziying_in = '0'
                except:
                    state_in = '5'
                    ziying_in = '5'
                    dingzhi_in = '5'
        else:
            # Sources other than JD/SN/GM are assumed in-stock,
            # first-party, non-custom.
            state_in = '1'
            ziying_in = '1'
            dingzhi_in = '1'
        # NOTE(review): string-built SQL; values here are internal flags
        # and a row id, but parameterized queries would still be safer.
        cur.execute(f"update product_all set state='{state_in}',ziying='{ziying_in}',dingzhi = '{dingzhi_in}' where id = '{dh_id}' ")
    conn.close()
    cur.close()
    # --- Phase 2: match each SKU against productcode_sku ---
    # `cur` reads/writes reverse_data; `cursor` reads ZI_DataBase.
    conn = pymssql.connect(host='123.56.115.207', user='zgcprice3311', password='admin@2018@)!*', database='reverse_data', autocommit=True)
    cur = conn.cursor()
    # NOTE(review): rebinding `conn` here leaks the reverse_data connection
    # object -- only its cursor `cur` stays reachable; neither connection
    # is closed before the function returns.
    conn = pymssql.connect(host='123.56.115.207', user='zgcprice3311', password='admin@2018@)!*', database='ZI_DataBase', autocommit=True)
    cursor = conn.cursor()
    match_data_sql = "select * from product_all where source in ('JD','SN','GM')"
    cur.execute(match_data_sql)
    resss = [item for item in cur.fetchall()]
    match_data = pd.DataFrame(resss, columns=[tuple[0] for tuple in cur.description])
    for j in range(len(match_data)):
        dg = match_data.loc[j]
        dg_id = dg.id
        dg_sku = dg.sku
        dg_frm = dg.source
        if dg_frm == 'SN':
            # Suning SKUs are sometimes stored with a "0000000000/" prefix;
            # try the bare SKU first, then the prefixed form.
            dg_skuu = '0000000000/' + dg_sku
            check_sql = f"select productcode from productcode_sku where frm = '{dg_frm}' and sku='{dg_sku}'"
            cursor.execute(check_sql)
            out_data = cursor.fetchall()
            if len(out_data) == 0:
                check_sql_1 = f"select productcode from productcode_sku where frm = '{dg_frm}' and sku= '{dg_skuu}'"
                cursor.execute(check_sql_1)
                out_data = cursor.fetchall()
            else:
                # NOTE(review): dead assignment -- unconditionally
                # overwritten by the if/elif/else just below.
                out_code = out_data
            if len(out_data) == 0:
                out_code = '未匹配上'
            elif len(out_data) > 1:
                out_code = '一个SKU匹配多个编码'
            else:
                out_code = out_data[0][0]
        else:
            check_sql = f"select productcode from productcode_sku where frm = '{dg_frm}' and sku='{dg_sku}'"
            cursor.execute(check_sql)
            out_data = cursor.fetchall()
            if len(out_data) == 0:
                out_code = '未匹配上'
            elif len(out_data) > 1:
                out_code = '一个SKU匹配多个编码'
            else:
                out_code = out_data[0][0]
        # Persist only a 13-character code or the explicit multi-match
        # marker; "未匹配上" (no match) is intentionally not written back.
        if len(out_code) == 13 or out_code == '一个SKU匹配多个编码':
            update_sql = f"update product_all set productcode='{out_code}' where id = '{dg_id}'"
            cur.execute(update_sql)
        else:
            pass
    print('完成')
\ No newline at end of file
main_merge.py
View file @
a283447e
...
@@ -16,6 +16,7 @@ from lstm_predict import LSTMNER
...
@@ -16,6 +16,7 @@ from lstm_predict import LSTMNER
import
os
import
os
from
ZOL_Crawler
import
CRAWLER
from
ZOL_Crawler
import
CRAWLER
import
threading
import
threading
from
API_ALL
import
Get_new
,
check_and_match
exitFlag
=
0
exitFlag
=
0
...
@@ -998,6 +999,8 @@ class crawl_data_fetch():
...
@@ -998,6 +999,8 @@ class crawl_data_fetch():
return
False
return
False
def
crawl_data_run
():
def
crawl_data_run
():
Get_new
()
check_and_match
()
#张楷部分。
thread_JD
=
myThread_crawl
(
'JD'
)
thread_JD
=
myThread_crawl
(
'JD'
)
thread_GM
=
myThread_crawl
(
'GM'
)
thread_GM
=
myThread_crawl
(
'GM'
)
thread_SN
=
myThread_crawl
(
'SN'
)
thread_SN
=
myThread_crawl
(
'SN'
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment