# -*- coding: utf-8 -*-
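"""Scrape a paginated listing site and save each linked article's body text.

The listing URL in get_page_urls() is a redacted placeholder ('https://?pg={}')
and must be replaced with the real host before the script will run. The
id='mw-content-text' selector suggests the target pages are MediaWiki-based.
"""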
import os
import traceback
from concurrent.futures import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup

def header(referer):
    """Build browser-like request headers with the given Referer."""
    headers = {
        'Pragma': 'no-cache',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
        'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
        'Referer': referer,
    }
    return headers

def request_page(url):
    """Fetch a URL and return its HTML text, or None on any failure."""
    try:
        # Send the spoofed headers (header() was defined but never called in
        # the original); the page's own URL serves as the Referer. The 10 s
        # timeout is an added safeguard against hung connections.
        response = requests.get(url, headers=header(url), timeout=10)
        if response.status_code == 200:
            return response.text
    except requests.RequestException:
        pass
    return None

def get_page_urls():
    """Collect article links from each thumbnail on listing pages 1-11."""
    urls = []
    for i in range(1, 12):
        baseurl = 'https://?pg={}'.format(i)  # redacted placeholder; fill in the real host
        html = request_page(baseurl)
        if html is None:
            continue  # skip listing pages that failed to load
        soup = BeautifulSoup(html, 'lxml')
        thumbs = soup.find_all(class_='responsive_thumb')  # renamed from `list` to avoid shadowing the builtin
        for item in thumbs:
            url = item.find('a').get('href')
            # print('page link: %s' % url)
            urls.append(url)
    return urls

def download(baseurl):
    """Save one page's article body to <title>.txt."""
    try:
        html = request_page(baseurl)
        if html is None:
            print('failed to fetch ' + baseurl)
            return
        soup = BeautifulSoup(html, 'lxml')
        content = soup.find(id='mw-content-text')  # MediaWiki body container
        # Note: page titles may contain characters that are invalid in
        # filenames; sanitize here if the target site requires it.
        name = soup.find('title').text
        print('Start downloading ' + name + '.txt')
        if os.path.exists(name + '.txt'):
            print(name + '.txt already exists')
        else:
            with open(name + '.txt', 'w', encoding='utf-8') as f:
                f.write(content.text)
            # print(content.text)
    except Exception:
        traceback.print_exc()
        # The original printed an undefined `i` here; report the URL instead.
        print('next one: ' + baseurl)

def download_all(urls):
    """Download every URL concurrently with a small worker pool."""
    # Use the ThreadPoolExecutor that is actually imported above; threads fit
    # this I/O-bound workload better than the original ProcessPoolExecutor,
    # and the loop now uses the `urls` parameter instead of a global.
    with ThreadPoolExecutor(max_workers=5) as executor:
        for url in urls:
            executor.submit(download, url)

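# Note: the futures returned by executor.submit() are discarded, so exceptions
# inside download() surface only via its own traceback.print_exc(). Collecting
# the futures and calling result() on each would propagate errors to the
# caller; that would be an optional hardening, not part of the original script.
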
if __name__ == '__main__':
    list_page_urls = get_page_urls()
    print(len(list_page_urls))
    # for url in list_page_urls:
    #     download(url)
    download_all(list_page_urls)