Python script for batch-downloading article text
(originally shared on a blog, 2023-04-17 11:54:00)
# -*- coding: utf-8 -*-
import concurrent
from concurrent.futures import ThreadPoolExecutor
import requests
import traceback
from bs4 import BeautifulSoup
import os

def header(referer):
    """Return browser-like HTTP request headers carrying *referer* as the Referer."""
    return {
        'Pragma': 'no-cache',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
        'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
        'Referer': '{}'.format(referer),
    }


def request_page(url):
    """Fetch *url* and return the response body as text.

    Returns None on any request error or on a non-200 status code.
    """
    try:
        # Bug fix: without a timeout, a stalled connection would hang
        # the calling worker forever.
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            return response.text
        # Explicit (rather than implicit fall-through) None for non-200.
        return None
    except requests.RequestException:
        return None


def get_page_urls():
    """Collect article links from listing pages 1 through 11.

    Returns a list of href strings taken from the first <a> inside each
    element with class 'responsive_thumb'. Pages that fail to download
    are skipped instead of crashing.
    """
    urls = []
    for i in range(1, 12):
        # NOTE(review): placeholder URL — the real site host was stripped
        # from the original post; fill it in before running.
        baseurl = 'https://?pg={}'.format(i)
        html = request_page(baseurl)
        if html is None:
            # Bug fix: request_page returns None on failure and
            # BeautifulSoup(None, ...) would raise TypeError.
            continue
        soup = BeautifulSoup(html, 'lxml')
        # Renamed from 'list', which shadowed the builtin.
        thumbs = soup.find_all(class_='responsive_thumb')
        for item in thumbs:
            urls.append(item.find('a').get('href'))
    return urls

def download(baseurl):
    """Download the main article text of *baseurl* and save it as '<title>.txt'.

    The body is taken from the element with id 'mw-content-text' and the
    filename from the page <title>. Existing files are left untouched.
    Any error is printed and swallowed so one bad page does not stop a
    batch run.
    """
    try:
        html = request_page(baseurl)
        soup = BeautifulSoup(html, 'lxml')
        content = soup.find(id='mw-content-text')
        name = soup.find('title').text
        # NOTE(review): the page title may contain characters that are
        # invalid in filenames (e.g. '/', ':') — consider sanitizing.
        print('开始下载'+name+'.txt')
        if os.path.exists(name+'.txt'):
            print(name+".txt文件已存在")
        else:
            # 'w' instead of 'a': the file is known not to exist here, and
            # write mode states the intent; the with-block handles flushing.
            with open(name+'.txt', 'w', encoding='utf-8') as f:
                f.write(content.text)
    except Exception:
        traceback.print_exc()
        # Bug fix: the original printed str(i), but 'i' is undefined in
        # this scope and raised a NameError inside the handler.
        print('next one ' + baseurl)

def download_all(urls):
    """Download every URL in *urls* concurrently with up to 5 workers.

    Bug fix: the original iterated the global 'list_page_urls' instead of
    the 'urls' parameter, so the function ignored its argument entirely.
    A ThreadPoolExecutor (already imported at the top of the file) replaces
    ProcessPoolExecutor: the work is network-I/O bound, so threads suffice
    and avoid the pickling/spawn overhead of processes.
    """
    with ThreadPoolExecutor(max_workers=5) as executor:
        for url in urls:
            executor.submit(download, url)



if __name__ == '__main__':
    # Gather every article URL from the listing pages, report how many
    # were found, then download them all concurrently.
    list_page_urls = get_page_urls()
    print(len(list_page_urls))
    download_all(list_page_urls)

 

THE END

Post a reply