# XPath 案例：百度贴吧图片爬虫 (XPath example: Baidu Tieba image crawler)
import urllib.parse
import urllib.request
from lxml import etree
class Imagespider:
    """Crawl a Baidu Tieba forum page range and download every post image.

    Flow: tiebaSpider() builds one listing URL per page ->
    loadPage() extracts thread links -> loadImage() extracts image URLs
    -> writeImage() saves each image to the current directory.
    """

    def __init__(self):
        # Interactive configuration: forum name plus inclusive page range.
        self.tiebaname = input("请输入需要爬取的贴吧名:")
        self.beginPage = int(input("请输入起始页:"))
        self.endPage = int(input("请输入结束页:"))
        self.url = "http://tieba.baidu.com/f?"
        # Browser-like UA so the site serves the normal HTML markup.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}

    def loadPage(self, url):
        """Fetch one forum listing page and visit each thread found on it.

        Fix: the original fetched this URL without ``self.headers`` while
        every other request in the class sent them; send them here too.
        """
        request = urllib.request.Request(url, headers=self.headers)
        html = urllib.request.urlopen(request).read()
        content = etree.HTML(html)
        # NOTE: "cleafix" (sic) is the literal class name used by the site.
        link_list = content.xpath('//div[@class="t_con cleafix"]/div[2]/div[1]/div[1]/a/@href')
        for link in link_list:
            # Thread hrefs are site-relative, e.g. "/p/123456".
            fulllink = "http://tieba.baidu.com" + link
            self.loadImage(fulllink)

    def loadImage(self, link):
        """Fetch one thread page and download every embedded post image."""
        request = urllib.request.Request(link, headers=self.headers)
        html = urllib.request.urlopen(request).read()
        content = etree.HTML(html)
        # BDE_Image marks images embedded in post bodies (not avatars/ads).
        link_list = content.xpath('//img[@class="BDE_Image"]/@src')
        for link in link_list:
            self.writeImage(link)

    def writeImage(self, link):
        """Download one image URL and save it under its URL basename.

        Fix: the original used ``link[-10:]`` as the filename, which can
        contain ``/`` (breaking open()) and collide between images; use the
        last path segment of the URL instead.
        """
        request = urllib.request.Request(link, headers=self.headers)
        image = urllib.request.urlopen(request).read()
        filename = link.rsplit("/", 1)[-1]
        with open(filename, "wb") as f:
            f.write(image)
        print("已经成功下载 " + filename)

    def tiebaSpider(self):
        """Iterate the configured page range (inclusive) and crawl each page."""
        for page in range(self.beginPage, self.endPage + 1):
            # Tieba paginates with pn = 50 * (page - 1).
            pn = (page - 1) * 50
            key = urllib.parse.urlencode({'pn': pn, "kw": self.tiebaname})
            fullurl = self.url + key
            self.loadPage(fullurl)
        print("谢谢使用")
if __name__ == "__main__":
    # Entry point: configure interactively, then crawl the page range.
    spider = Imagespider()
    spider.tiebaSpider()
# 糗事百科案例 (Qiushibaike scraper example)
import requests
from lxml import etree
class Qiubai_spider():
    """Scrape joke posts (author, content, vote/comment counts, avatar)
    from one page of qiushibaike.com/text and append them to xx.txt.
    """

    def __init__(self):
        # Base listing URL; the page number is appended in run().
        self.url = "http://www.qiushibaike.com/text/page/"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
        }

    def parse_url(self, url):
        """GET *url* and return the parsed lxml HTML tree."""
        response = requests.get(url, timeout=10, headers=self.headers)
        return etree.HTML(response.text)

    def parse_content(self, html):
        """Yield one dict per post found in the parsed page *html*.

        Fix: the original prepended "http:" whenever the avatar URL did not
        start with "http:", which mangled "https://..." URLs into
        "http:https://...". Only scheme-relative "//..." URLs need a scheme.
        """
        item_temp = html.xpath('//div[@id="content-left"]/div')
        print(len(item_temp))
        for item in item_temp:
            avatar = item.xpath("./div[1]//img/@src")[0]
            # Add a scheme only when one is genuinely missing.
            if not avatar.startswith(("http:", "https:")):
                avatar = "http:" + avatar
            name = item.xpath("./div[1]//h2/text()")[0]
            # string(.) flattens the span's mixed text/child nodes into one string.
            content = item.xpath(".//div[@class='content']/span")[0].xpath('string(.)')
            star_number = item.xpath('./div[@class="stats"]//i/text()')[0]
            comment_number = item.xpath("./div[@class='stats']/span[2]/a/i/text()")[0]
            yield {
                'author': name,
                'content': content,
                'star': star_number,
                'comment': comment_number,
                'avatar': avatar
            }

    def run(self):
        '''函数的主要逻辑实现 — ask for a page number, fetch and parse it,
        and append each post dict as one line to xx.txt.
        '''
        page_num = input('请输入要爬取的页码')
        url = self.url + page_num
        html = self.parse_url(url)
        with open('xx.txt', 'a', encoding='utf-8') as f:
            for item in self.parse_content(html):
                f.write(str(item) + '\n')
if __name__ == "__main__":
    # Entry point: create the scraper and run its interactive main loop.
    spider = Qiubai_spider()
    spider.run()