Beautiful Soup

Beautiful Soup 4 通过 PyPI 发布,可以直接通过 pip 安装:

pip install beautifulsoup4

解析库可以使用python自带的解析库。常用的是lxml,这里使用lxml

pip install lxml

本来想用豆瓣的页面,但请求直接被屏蔽了,所以换成 IMDb 的 Top 250 电影地址(imdbTop250 movie)。首先编辑 python 文件,引入需要的包:

# -*- coding:utf-8 -*-
import lxml
import urllib2
from bs4 import BeautifulSoup

然后使用urllib2获取电影页面,添加以下header中的参数

def start():
	"""Download the IMDb Top 250 chart page and hold its HTML in ``html``.

	The request carries browser-like ``User-agent`` and ``referer`` headers
	and advertises gzip support. A gzip-compressed response body is
	decompressed before use.
	"""
	import zlib  # local import: only needed to decode gzip response bodies

	url = 'https://www.imdb.com/chart/top?ref_=nv_mv_250'

	request = urllib2.Request(url)
	request.add_header('User-agent','Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36')
	# The header name used to be misspelled ('Accetp-Encoding'), so gzip was
	# never actually requested. With the correct name the server may send a
	# compressed body, which is handled below.
	request.add_header('Accept-Encoding','gzip')
	request.add_header('referer','https://www.imdb.com/')

	response = urllib2.urlopen(request)
	try:
		html = response.read()
		content_encoding = response.info().get('Content-Encoding', '')
	finally:
		# Always release the connection, even if reading fails.
		response.close()

	if content_encoding.lower() == 'gzip':
		# 16 + MAX_WBITS tells zlib to expect a gzip header and trailer.
		html = zlib.decompress(html, 16 + zlib.MAX_WBITS)

用浏览器查看页面后发现,电影数据都放在 class 属性为 lister-list 的 tbody 中(imdb_top250)。下面用 bs 解析页面并获取 tbody:

# Parse the downloaded page using the lxml parser backend.
soup = BeautifulSoup(html,'lxml')
# Beautiful Soup's keyword for filtering on the CSS class is `class_`
# (with a trailing underscore); the previous spelling `calss_` filtered on
# a non-existent attribute and could not match the table body.
tbody = soup.find(class_='lister-list')

我们只获取 title 以及 rate,所以只抓取对应的 td 数据就可以了。直接用 string 就可以获取标签中的内容(title_rate):

for tr in tbody.find_all('tr'):
	# The title cell holds both the movie link and the release year.
	title_cell = tr.find('td', class_='titleColumn')
	name = title_cell.a.string
	# The year is wrapped in parentheses, which look untidy; strip them
	# with a regular expression (remember to import the `re` module).
	year = re.match(r'\((\d+)\)', title_cell.span.string)
	rate = tr.find('td', class_='ratingColumn').strong.string
	# One row per movie: padded title, rating, year.
	print('  '.join([name.ljust(70), rate, year.group(1)]))

输出结果如下(result)。也可以写入文件:

		# One line per movie: padded title, rating, year.
		movieString = name.ljust(70) + '  ' + rate + '  '+ year.group(1) +'\n'
		# Append to the log; `with` closes the file even if write() raises.
		with open("movie.log","a") as a:
			a.write(movieString)

写入文件时遇到字符集相关的报错 UnicodeEncodeError,可以使用以下代码改变默认字符集:

# Python 2-only workaround for the UnicodeEncodeError raised when writing
# the movie titles to a file: switch the default encoding to UTF-8.
import sys
# NOTE(review): presumably the reload is what makes sys.setdefaultencoding
# available again after interpreter start-up; confirm before reordering.
reload(sys)
sys.setdefaultencoding('utf-8')

完整代码

# -*- coding:utf-8 -*-
import lxml
import urllib2
import re
from bs4 import BeautifulSoup
# Python 2-only workaround: switch the default encoding to UTF-8 so that
# writing the movie titles to movie.log does not raise UnicodeEncodeError.
import sys
# NOTE(review): presumably the reload is what makes sys.setdefaultencoding
# available again after interpreter start-up; confirm before reordering.
reload(sys)

sys.setdefaultencoding('utf-8')



def start():
	"""Scrape the IMDb Top 250 chart and append one line per movie to movie.log.

	Each line holds the title left-justified to 70 characters, the rating
	and the release year, separated by two spaces.

	The page is fetched with browser-like headers. A gzip-compressed
	response body is decompressed before it is parsed with the lxml backend.
	"""
	import zlib  # local import: only needed to decode gzip response bodies

	url = 'https://www.imdb.com/chart/top?ref_=nv_mv_250'

	request = urllib2.Request(url)
	request.add_header('User-agent','Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36')
	# The header name used to be misspelled ('Accetp-Encoding'), so gzip was
	# never actually requested. With the correct name the server may send a
	# compressed body, which is handled below.
	request.add_header('Accept-Encoding','gzip')
	request.add_header('referer','https://www.imdb.com/')

	response = urllib2.urlopen(request)
	try:
		html = response.read()
		content_encoding = response.info().get('Content-Encoding', '')
	finally:
		# Always release the connection, even if reading fails.
		response.close()

	if content_encoding.lower() == 'gzip':
		# 16 + MAX_WBITS tells zlib to expect a gzip header and trailer.
		html = zlib.decompress(html, 16 + zlib.MAX_WBITS)

	soup = BeautifulSoup(html,'lxml')
	# All chart rows live in the element whose class is "lister-list".
	tbody = soup.find(class_='lister-list')

	# Open the log once for the whole run instead of once per row; `with`
	# closes it even if parsing a row raises.
	with open("movie.log","a") as log:
		for tr in tbody.find_all('tr'):
			# The title cell holds both the movie link and the year.
			title_cell = tr.find('td',class_='titleColumn')
			name = title_cell.a.string
			# The year is rendered as "(1994)"; capture only the digits.
			year = re.match(r'\((\d+)\)',title_cell.span.string)
			rate = tr.find('td',class_='ratingColumn').strong.string

			log.write(name.ljust(70) + '  ' + rate + '  '+ year.group(1) +'\n')

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
	start()