# -*- coding: utf-8 -*-
import os
import re
import traceback
from concurrent.futures import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup


def header(referer):
    """Build request headers that mimic a desktop Chrome browser."""
    headers = {
        'Pragma': 'no-cache',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
        'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
        'Referer': '{}'.format(referer),
    }
    return headers


def request_page(url):
    """Fetch a page and return its HTML, or None on any failure."""
    try:
        response = requests.get(url, headers=header(url))
        if response.status_code == 200:
            return response.text
    except requests.RequestException:
        pass
    return None


def get_page_urls():
    """Collect the article links from every listing page."""
    urls = []
    for i in range(1, 12):
        baseurl = 'https://?pg={}'.format(i)  # base URL left blank in the source
        html = request_page(baseurl)
        if html is None:  # skip listing pages that failed to load
            continue
        soup = BeautifulSoup(html, 'lxml')
        # Renamed from `list`, which shadowed the built-in.
        thumbs = soup.find_all(class_='responsive_thumb')
        for item in thumbs:
            url = item.find('a').get('href')
            # print('page link: %s' % url)
            urls.append(url)
    return urls


def download(baseurl):
    """Download one article and save its body text under the page title."""
    try:
        html = request_page(baseurl)
        if html is None:
            print('failed to fetch ' + baseurl)
            return
        soup = BeautifulSoup(html, 'lxml')
        content = soup.find(id='mw-content-text')
        name = soup.find('title').text
        # Strip characters that are not allowed in file names.
        name = re.sub(r'[\\/:*?"<>|]', '_', name).strip()
        print('Downloading ' + name + '.txt')
        if os.path.exists(name + '.txt'):
            print(name + '.txt already exists')
        else:
            with open(name + '.txt', 'w', encoding='utf-8') as f:
                f.write(content.text)
    except Exception:
        traceback.print_exc()
        print('next one ' + baseurl)  # was str(i), which is undefined here


def download_all(urls):
    # Downloads are I/O bound, so a thread pool is the right choice; the
    # original mixed ProcessPoolExecutor with the ThreadPoolExecutor import
    # and iterated the global list instead of the `urls` parameter.
    with ThreadPoolExecutor(max_workers=5) as executor:
        for url in urls:
            executor.submit(download, url)


if __name__ == '__main__':
    list_page_urls = get_page_urls()
    print(len(list_page_urls))
    # for url in list_page_urls:
    #     download(url)
    download_all(list_page_urls)