V2_multipage_crawler_low_performance.py
# encoding=utf8
from selenium import webdriver
import BeautifulSoup
import re
import time
import json
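# NOTE: this script targets Python 2 (print statements) and BeautifulSoup 3
# (the standalone 'BeautifulSoup' package); webdriver.Chrome() additionally
# requires chromedriver to be available on PATH.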
# Created by Gao Dashan 2017/4/27
# This crawler simulates browser clicking behavior to turn to the next page
# using selenium. Because the browser must render each page, the program runs
# quite slowly and is occasionally unstable.
# Multipage_cnblogs_clawler.py uses urllib instead and is therefore much faster.
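# A minimal sketch of the urllib-based fetch mentioned above, for comparison
# (an assumption for illustration; the real Multipage_cnblogs_clawler.py may
# differ). The helper is defined here but never called.
def fetch_page_with_urllib(page_index):
    # Request the paging URL directly instead of rendering it in a browser.
    import urllib2
    url = 'http://www.cnblogs.com/sitehome/p/%d' % page_index
    return BeautifulSoup.BeautifulSoup(urllib2.urlopen(url).read())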


def crwal(soup, page_index):
    # param soup: parsed web page data to be analyzed
    # param page_index: the index of the current page
    post_list_div = soup.find(attrs={"id": "post_list"})
    titlelnk = post_list_div.findAll(attrs={"class": "titlelnk"})
    post_item_summary = post_list_div.findAll(attrs={"class": "post_item_summary"})
    post_item_foot = post_list_div.findAll(attrs={"class": "post_item_foot"})
    lightblue = post_list_div.findAll(attrs={"class": "lightblue"})
    article_comment = post_list_div.findAll(attrs={"class": "article_comment"})
    article_view = post_list_div.findAll(attrs={"class": "article_view"})
    # get the link index of the next page
    paging = soup.find(attrs={"id": "paging_block"})
    next_pages = paging.findAll(attrs={"class": "pager"})
    next_p = len(next_pages[0].contents) - 1
    # get the index of the current page from the "Next" link's href
    href = next_pages[0].contents[next_p]['href']
    # str.lstrip() strips a character set, not a prefix, so take the last
    # path segment instead (href looks like '/sitehome/p/<n>')
    href_ind = href.rsplit('/', 1)[-1]
    href_ind = str(int(href_ind) - 1)
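    # e.g. on page 1 the "Next" link points to '/sitehome/p/2', so href_ind
    # becomes '1' for the current page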
    length = len(titlelnk)
    # alias the result lists; each entry is overwritten with its cleaned string below
    title = titlelnk
    post_time = post_item_foot  # renamed from 'time' so the time module is not shadowed
    name = lightblue
    comment = article_comment
    view = article_view
    summary = post_item_summary
    # process data
    for i in range(length):
        title[i] = titlelnk[i].string.strip()
        summary[i] = post_item_summary[i].contents[len(post_item_summary[i].contents) - 1].string.strip()
        post_time[i] = post_item_foot[i].contents[2].strip()
        name[i] = lightblue[i].string
        comment[i] = article_comment[i].a.string
        view[i] = article_view[i].a.string
    # regular expression patterns: time_re matches timestamps,
    # num_re matches view and comment counts
    time_re = re.compile(r'\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}')
    num_re = re.compile(r'\d+')
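    # e.g. time_re matches '2017-04-27 10:30'; num_re extracts the first run
    # of digits from the comment/view strings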
    # normalize the text fields
    for i in range(length):
        post_time[i] = re.search(time_re, post_time[i]).group(0)
        comment[i] = re.search(num_re, comment[i]).group(0)
        view[i] = re.search(num_re, view[i]).group(0)
        # print the processed record
        print 'Title:', title[i], '\n', 'Abstract:', summary[i]
        print 'Author:', name[i], '\t', 'Time:', post_time[i], '\t', 'Comment:', comment[i], '\t', 'View:', view[i], '\n'
    # read previously saved data from cnblogs_data_from_CCTP.json
    if page_index > 1:
        with open('cnblogs_data_from_CCTP.json', 'r') as in_file:
            data = json.load(in_file)
    else:
        data = {}
    for i in range(length):
        # key each record by '<page index>_<position on page>'
        data[href_ind + '_%d' % i] = []
        # append the record under that key
        data[href_ind + '_%d' % i].append({
            'view': view[i],
            'title': title[i],
            'summary': summary[i],
            'author': name[i],
            'comment': comment[i],
            'time': post_time[i]
        })
    with open('cnblogs_data_from_CCTP.json', 'w') as outfile:
        json.dump(data, outfile)
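    # e.g. the second post on page 1 ends up under the key '1_1'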
    # return: next_p is the position of the "Next" link in the pager; it is
    # fed back into itera() as the index of the anchor to click.
    return next_p


def itera(next_page, pa):
    # param next_page: index of the pager anchor to click (the "Next" link)
    # param pa: the index of the page being crawled, passed through to crwal()
    print '-------------------This is a divider-------------\n'
    st = '//*[@id="paging_block"]/div/a[' + '%d' % next_page + ']'
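    # e.g. next_page == 12 yields '//*[@id="paging_block"]/div/a[12]'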
    driver.find_element_by_xpath(st).click()
    time.sleep(5)  # wait for the browser to render the new page
    webdata2 = driver.page_source
    soup2 = BeautifulSoup.BeautifulSoup(webdata2)
    return crwal(soup2, pa)


if __name__ == '__main__':
    crawl_page_number = 4  # crawl 4 pages here as an illustration
    driver = webdriver.Chrome()
    a = 'http://www.cnblogs.com'
    driver.get(a)
    webdata = driver.page_source
    soup1 = BeautifulSoup.BeautifulSoup(webdata)
    next_page = crwal(soup1, 1)
    for i in range(crawl_page_number - 1):
        next_page = itera(next_page, i + 2)