Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
重
重点类信息提取
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ZGC_INDEX
重点类信息提取
Commits
4b96f5b3
Commit
4b96f5b3
authored
Mar 31, 2021
by
rico.liu
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
update
parent
69cdf41f
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
15 additions
and
3 deletions
+15
-3
ZOL_Crawler.py
公共代码/ZOL_Crawler.py
+15
-3
No files found.
公共代码/ZOL_Crawler.py
View file @
4b96f5b3
...
...
@@ -144,6 +144,17 @@ class CRAWLER:
source
=
source
.
text
.
replace
(
'<br />'
,
''
)
html
=
etree
.
HTML
(
source
)
#get category and product name
try
:
category
=
html
.
xpath
(
"//div[@class='breadcrumb']/a[2]/text()"
)[
0
]
except
:
category
=
'无法获取类别'
try
:
product_name
=
html
.
xpath
(
"//div[@class='breadcrumb']/a[4]/text()"
)[
0
]
except
:
product_name
=
'无法获取产品名称'
#get Zol attribute and value
Zol_data
=
pd
.
DataFrame
()
attr_list
=
html
.
xpath
(
"//span[contains(@id,'newPmName')]/text()"
)
...
...
@@ -193,9 +204,10 @@ class CRAWLER:
break
if
get_value
==
''
:
get_data_list
.
append
(
"爬取不到数据"
)
res
=
dict
([(
k
,
v
)
for
k
,
v
in
zip
(
self
.
necessary_attrs
,
get_data_list
)])
res
=
{
'产品类别'
:
category
,
'产品名称'
:
product_name
}
res
.
update
(
dict
([(
k
,
v
)
for
k
,
v
in
zip
(
self
.
necessary_attrs
,
get_data_list
)]))
return
res
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment