全国各个省市地区

介绍


- 工具为Beautifulsoup([这里是介绍](https://www.cassi.top/article/3 "这里是介绍")),改用requests获取html

问题


  • 编码问题,提取数据后汉字为乱码。所以在解码时添加了参数from_encoding='gbk'
    soup = BeautifulSoup(raw_html, 'lxml', from_encoding='gbk')
  • 异常。请求网页时偶尔会出现 read timeout(读取超时)异常,并未处理。而且请求时间较长,应该是同一个问题,待以后修改。

代码


# -*- coding:utf-8 -*-
import requests
import lxml
import json
from bs4 import BeautifulSoup

# Base URL of the 2018 NBS (stats.gov.cn) administrative-division pages.
# Note the trailing slash: relative hrefs from the pages are appended directly.
route_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/'


def get_html(url=None, timeout=30):
    """Fetch *url* with a browser-like User-Agent and return the raw body.

    Args:
        url: Page to fetch. ``None`` short-circuits and returns ``None``.
        timeout: Seconds before the request is aborted. The original code
            had no timeout and could hang indefinitely on this slow server
            (a known issue noted in the article); a finite timeout makes
            the failure explicit instead.

    Returns:
        The undecoded response body as ``bytes``, or ``None`` if *url*
        is ``None``.

    Raises:
        requests.exceptions.RequestException: on connection errors or
            when *timeout* elapses — callers currently do not handle this.
    """
    if url is None:
        return None

    # Spoof a desktop-browser UA — presumably to avoid bot filtering on
    # the government site (TODO confirm it is actually required).
    user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/75.0.3770.142 Safari/537.36')
    headers = {'User-Agent': user_agent}
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.content


def get_all_province():
    """Scrape the index page and build the full province tree.

    Returns:
        dict mapping province name -> result of :func:`get_all_city`
        for that province (itself a dict of city -> county list).
    """
    # route_url already ends with '/'; the original 'route_url + "/index.html"'
    # produced a double slash ('2018//index.html'). Servers tolerate it, but
    # the clean URL is correct.
    index_url = route_url + 'index.html'

    raw_html = get_html(index_url)
    # Pages are GBK-encoded; decode explicitly so Chinese names are not mojibake.
    soup = BeautifulSoup(raw_html, 'lxml', from_encoding='gbk')
    province = {}
    for row in soup.find_all('tr', class_='provincetr'):
        for cell in row.find_all('td'):
            link = cell.find('a')
            name = link.get_text()
            # Each province link leads to its city listing page.
            city_url = route_url + link['href']
            province[name] = get_all_city(city_url)
    return province


def get_all_city(url=None, is_list=False):
    """Scrape every city from a province page.

    Args:
        url: Province page URL (rows with class ``citytr``).
        is_list: When True, return just the city names as a list;
            otherwise recurse into each city's county page and return
            ``{city name: [county names]}``.
    """
    # Pages are GBK-encoded; decode explicitly.
    soup = BeautifulSoup(get_html(url), 'lxml', from_encoding='gbk')
    result = [] if is_list else {}
    for row in soup.find_all('tr', class_='citytr'):
        # Each row holds two anchors: the area code and the city name.
        code_anchor, name_anchor = row.find_all('a')
        name = name_anchor.get_text()
        if is_list:
            result.append(name)
            continue
        # Follow the city's link and collect its counties as a name list.
        country_url = route_url + name_anchor['href']
        result[name] = get_all_county(country_url, True)
    return result


def get_all_county(url=None, is_list=False):
    """Scrape all counties from a city page.

    Args:
        url: City page URL (rows with class ``countytr``).
        is_list: When True, return a list of county names; otherwise
            return ``{county name: relative href}``.

    Rows whose name cell carries no <a> tag are skipped.
    """
    # Pages are GBK-encoded; decode explicitly.
    soup = BeautifulSoup(get_html(url), 'lxml', from_encoding='gbk')
    collected = [] if is_list else {}
    for row in soup.find_all('tr', class_='countytr'):
        # Two cells per row: area code, then the county name.
        _code_td, name_td = row.find_all('td')
        anchor = name_td.find('a')
        if not anchor:
            # Some entries are plain text without a link — skip them.
            continue
        if is_list:
            collected.append(anchor.get_text())
        else:
            collected[anchor.get_text()] = anchor['href']
    return collected


def main():
    """Scrape the full province/city/county tree and append it to
    ``result.txt`` as a single JSON object."""
    tree = get_all_province()
    # ensure_ascii=False keeps Chinese names human-readable; pin UTF-8 so
    # the output does not depend on the platform's locale default (the
    # original open() without encoding could fail or vary on Windows).
    # The with-statement guarantees the file is closed even on error.
    with open('result.txt', 'a', encoding='utf-8') as output:
        output.write(json.dumps(tree, ensure_ascii=False))


if __name__ == '__main__':
    main()


结果