Beautiful Soup 案例

from bs4 import BeautifulSoup
import requests

# Present a mobile browser User-Agent so the site serves its mobile (WAP) page.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/69.0.3497.100 Mobile Safari/537.36"
    ),
}

url = 'https://news.sina.cn/gn/?from=wap'
# requests has no default timeout; without one a stalled connection would
# block the script forever.
web_data = requests.get(url, headers=headers, timeout=10)
# Fail loudly on 4xx/5xx instead of parsing an error page as if it were news.
web_data.raise_for_status()

# The page declares charset=utf-8, so decode the body as UTF-8 explicitly;
# otherwise much of the text comes out garbled.
web_data.encoding = 'utf-8'
soup = BeautifulSoup(web_data.text, 'lxml')

# Walk every <section> nested inside another <section>. Note this is a
# descendant selector on tag names, not a class selector (a class selector
# would be written with a leading dot, e.g. '.news-item').
for news in soup.select('section section'):
    # The headline lives in an <h2>; sections without one are not news
    # entries, so skip them.
    heading = news.select_one('h2')
    if heading is None:
        continue
    title = heading.text

    # News image URL, read from the <img> tag's data-src attribute.
    # Fall back to 'null' when there is no image or no data-src.
    img = news.select_one('img')
    src = img.get('data-src') if img is not None else None
    if not src:
        image = 'null'
    elif src.startswith('//'):
        # Protocol-relative URL: add the scheme so it is usable on its own.
        image = 'https:' + src
    else:
        # Already carries a scheme (or is some other form): leave untouched
        # rather than producing something like 'https:https://...'.
        image = src

    # The article link is the href attribute of the first <a>, not its text.
    anchor = news.select_one('a')
    link = anchor.get('href', 'null') if anchor is not None else 'null'

    # News source, taken from the <cite> tag when present.
    cite = news.select_one('cite')
    form_info = cite.text if cite is not None else 'null'

    print(title, image, link, form_info)
Copyright © 2019 ChenYangchen. All rights reserved. Powered by Gitbook. 该文件修订时间: 2019-06-17 00:42:52