1. Approach

The script mainly uses XPath (via lxml) to parse the data, the urllib library to fetch the pages, and the openpyxl library to write the results into an Excel spreadsheet.
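
To make the parsing step concrete, here is a minimal sketch of the same kind of XPath query run against a simplified, made-up HTML snippet; the div class="content" / p structure is only an assumption standing in for Weibo's real search-result markup, which the full script below targets.

from lxml import etree

# Hypothetical, simplified stand-in for two search-result cards
html = '''
<div class="content"><p nick-name="user_a">first post text</p></div>
<div class="content"><p nick-name="user_b">second post text</p></div>
'''

tree = etree.HTML(html)
print(tree.xpath('//div[@class="content"]/p/text()'))      # ['first post text', 'second post text']
print(tree.xpath('//div[@class="content"]/p/@nick-name'))  # ['user_a', 'user_b']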

2. Code

Here is the complete code.

import urllib.request
import urllib.parse
from lxml import etree
from openpyxl import Workbook

# Example of a rendered search URL: https://s.weibo.com/weibo/%23山西暴雨%23&page=1
def topic_create_request(page):
    """Build a Request object for one page of the topic search results."""
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36 Edg/103.0.1264.49',
        'cookie': 'PC_TOKEN=fc6bbdffa0; login_sid_t=ea94e0e4632a5bd78f1e7c8d0bac86ca; cross_origin_proto=SSL; _s_tentry=cn.bing.com; UOR=cn.bing.com,weibo.com,cn.bing.com; Apache=1531283071038.816.1657941322931; SINAGLOBAL=1531283071038.816.1657941322931; ULV=1657941322934:1:1:1:1531283071038.816.1657941322931:; WBtopGlobal_register_version=2022071611; SSOLoginState=1657941525; SUB=_2A25P1l5FDeRhGedG61cZ9yrEzjuIHXVtOWINrDV8PUJbkNAKLXDdkW1NUZD1z40aAVyb8FaxE84OO2us1ukOJcAu; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFOJa89QcLwxaa-11YEbBHM5NHD95Qp1h5f1hMX1h-NWs4DqcjVi--ciKn4iKyFi--ciKLhi-iWi--NiK.Xi-2Ri--ciKnRi-zNeKn7SKnNShnfS7tt',
    }
    base_data = {
        'q': '#山西暴雨#',
        'page': str(page),
    }
    base_url = 'https://s.weibo.com/weibo?'
    # URL-encode the query parameters ('#' becomes %23, Chinese characters are percent-encoded)
    data = urllib.parse.urlencode(base_data)
    url = base_url + data
    request = urllib.request.Request(url=url, headers=headers)
    return request


def topic_get_content(request):
    """Send the request and return the decoded HTML of the result page."""
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')
    return content


def download(content, page):
    """Extract the post text with XPath and write it into an Excel file for this page."""
    tree = etree.HTML(content)
    # nickname_list = tree.xpath('//div[@class="content"]/p/@nick-name')
    content_list = tree.xpath('//div[@class="content"]/p/text()')
    book = Workbook()
    sheet = book.active
    filename = str(page) + "weibo.xlsx"
    # openpyxl cells are 1-indexed, so the first post goes into A1
    for i, text in enumerate(content_list, start=1):
        sheet['A' + str(i)] = text
    book.save(filename)


if __name__ == '__main__':
    start_page = int(input('please enter the start page:'))
    end_page = int(input('please enter the end page:'))
    for page in range(start_page, end_page + 1):
        request = topic_create_request(page)   # build the request for this page
        content = topic_get_content(request)   # fetch the HTML
        download(content, page)                # parse it and save to "{page}weibo.xlsx"
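
Running the script prompts for a start page and an end page; each page in that range is fetched, parsed, and saved to its own file named "{page}weibo.xlsx" in the working directory (pages 1 to 3, for example, produce 1weibo.xlsx, 2weibo.xlsx, and 3weibo.xlsx). Note that the cookie in the headers comes from a logged-in Weibo session and will eventually expire, so it will likely need to be replaced with a fresh cookie copied from your own browser before the requests return the expected results.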