XPath Examples
Baidu Tieba image spider
The script below asks for a forum name and a page range, collects every post link on each listing page with XPath, and then downloads all images found in those posts.
import urllib.parse
import urllib.request
from lxml import etree
class ImageSpider:
    def __init__(self):
        self.tiebaname = input("Enter the name of the Tieba forum to crawl: ")
        self.beginPage = int(input("Enter the start page: "))
        self.endPage = int(input("Enter the end page: "))
        self.url = "http://tieba.baidu.com/f?"
        # Pretend to be a desktop browser so the site serves the normal page.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
    def loadPage(self, url):
        """Fetch one listing page and hand every post link to loadImage()."""
        request = urllib.request.Request(url, headers=self.headers)
        html = urllib.request.urlopen(request).read()
        content = etree.HTML(html)
        # "cleafix" is not a typo here: it matches Tieba's own markup, so don't "fix" it.
        link_list = content.xpath('//div[@class="t_con cleafix"]/div[2]/div[1]/div[1]/a/@href')
        for link in link_list:
            # The href values are relative ("/p/..."), so prepend the host.
            fulllink = "http://tieba.baidu.com" + link
            self.loadImage(fulllink)
    def loadImage(self, link):
        """Fetch one post page and download every image embedded in it."""
        request = urllib.request.Request(link, headers=self.headers)
        html = urllib.request.urlopen(request).read()
        content = etree.HTML(html)
        # BDE_Image is the class Tieba puts on images inside post bodies.
        image_list = content.xpath('//img[@class="BDE_Image"]/@src')
        for image_link in image_list:
            self.writeImage(image_link)
    def writeImage(self, link):
        """Download one image and save it to the current directory."""
        request = urllib.request.Request(link, headers=self.headers)
        image = urllib.request.urlopen(request).read()
        # Crude but serviceable: the last 10 characters of the URL
        # (tail of the hash plus the extension) serve as the filename.
        filename = link[-10:]
        with open(filename, "wb") as f:
            f.write(image)
        print("Downloaded " + filename)
    def tiebaSpider(self):
        for page in range(self.beginPage, self.endPage + 1):
            # Tieba shows 50 posts per listing page; pn is the post offset, not the page number.
            pn = (page - 1) * 50
            key = urllib.parse.urlencode({'pn': pn, 'kw': self.tiebaname})
            fullurl = self.url + key
            self.loadPage(fullurl)
        # Runs once, after every requested page has been crawled.
        print("Thanks for using the spider.")
if __name__ == "__main__":
    mySpider = ImageSpider()
    mySpider.tiebaSpider()
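Both spiders lean on the same lxml idiom: parse raw HTML with etree.HTML() and pull values out with an XPath expression. Below is a minimal, self-contained sketch of that idiom; the HTML string is invented for illustration and only mimics the shape of Tieba's listing markup.
from lxml import etree

# Invented sample shaped like two Tieba listing entries.
sample = '''
<div class="t_con cleafix"><a href="/p/1234567890">a post title</a></div>
<div class="t_con cleafix"><a href="/p/9876543210">another post</a></div>
'''

content = etree.HTML(sample)  # tolerates fragments and broken markup
# An XPath ending in /@href yields the attribute values as plain strings.
links = content.xpath('//div[@class="t_con cleafix"]/a/@href')
print(links)  # ['/p/1234567890', '/p/9876543210']
The pagination in tiebaSpider is just urllib.parse.urlencode turning a dict into a query string, so page 2 of a forum named "python" becomes:
import urllib.parse
print("http://tieba.baidu.com/f?" + urllib.parse.urlencode({'pn': 50, 'kw': 'python'}))
# http://tieba.baidu.com/f?pn=50&kw=python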
Qiushibaike example
The script below fetches one page of Qiushibaike's text channel, extracts each post's author, body, vote and comment counts, and avatar URL with XPath, and appends the results to a file.
import requests
from lxml import etree
class QiubaiSpider:
    def __init__(self):
        self.url = "http://www.qiushibaike.com/text/page/"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
        }
    def parse_url(self, url):
        """Fetch a page and return it as an lxml element tree."""
        response = requests.get(url, timeout=10, headers=self.headers)
        return etree.HTML(response.text)
    def parse_content(self, html):
        """Walk the post list and yield one dict of fields per post."""
        item_temp = html.xpath('//div[@id="content-left"]/div')
        print(len(item_temp))  # how many posts were found on this page
        for item in item_temp:
            # Avatar URLs are protocol-relative ("//..."), so add the scheme.
            avatar = item.xpath("./div[1]//img/@src")[0]
            if not avatar.startswith("http:"):
                avatar = "http:" + avatar
            name = item.xpath("./div[1]//h2/text()")[0]
            # string(.) joins all descendant text into one string, instead of
            # the list of fragments that text() would return.
            content = item.xpath(".//div[@class='content']/span")[0].xpath('string(.)')
            star_number = item.xpath('./div[@class="stats"]//i/text()')[0]
            comment_number = item.xpath("./div[@class='stats']/span[2]/a/i/text()")[0]
            yield {
                'author': name,
                'content': content,
                'star': star_number,
                'comment': comment_number,
                'avatar': avatar
            }
    def run(self):
        """Main flow: build the URL, fetch, parse, and append results to a file."""
        page_num = input("Enter the page number to crawl: ")
        url = self.url + page_num
        html = self.parse_url(url)
        with open('xx.txt', 'a', encoding='utf-8') as f:
            for item in self.parse_content(html):
                f.write(str(item) + '\n')
if __name__ == "__main__":
    qiubai = QiubaiSpider()
    qiubai.run()
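parse_content grabs each post body with xpath('string(.)') rather than text(). The sketch below, on an invented snippet, shows why: text() returns a list of direct text fragments and skips text nested in child tags, while string(.) joins all descendant text into one string.
from lxml import etree

# Invented snippet loosely shaped like a Qiushibaike post body.
doc = etree.HTML('<div class="content"><span>plain <b>bold</b> more</span></div>')
span = doc.xpath('//div[@class="content"]/span')[0]
print(span.xpath('text()'))     # ['plain ', ' more']  -- misses the text inside <b>
print(span.xpath('string(.)'))  # plain bold more      -- all descendant text, joined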