Using requests together with selenium in Python to visit pages and record their download URLs

python分享 (256) 2023-04-17 11:57:37
# -*- coding: utf-8 -*-

from time import sleep
from selenium import webdriver
import requests
import urllib3
from bs4 import BeautifulSoup


# verify=False skips certificate verification (requests verifies by default);
# suppress the InsecureRequestWarning that urllib3 would otherwise print.
urllib3.disable_warnings()

headers = {
"cookie": "varify_key=kisspng; fotCookie=1; __gads=ID=b691421f54f0f43e-22b06658ecd00005:T=1647050530:RT=1647050530:S=ALNI_Map1ltTfpamxwCcGyFh1N5NFdUUnw; __atuvc=2%7C10%2C1%7C11; _gid=GA1.2.1245356425.1647615079; _gat_gtag_UA_193347727_2=1; _ga_WR5JC9XF6P=GS1.1.1647615078.7.1.1647615100.0; _ga=GA1.2.748602746.1647050410",
"referer": "https://www.****.com/",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36"
}

def getHTMLText(url):
    # Fetch the page with the browser-like headers and return its decoded text.
    r = requests.get(url, headers=headers, timeout=30, verify=False)
    r.raise_for_status()
    r.encoding = r.apparent_encoding  # let requests guess the real encoding
    return r.text

def writeText(name='H:\\pngurl.txt', text='\n'):
    # The directory in the target path must already exist.
    with open(name, 'a+', encoding="utf-8") as txt:
        txt.write(text)  # the with-block closes the file automatically

def formdata():
    # Not called from __main__; sketch of the multipart POST the download form expects.
    # The target URL was left blank in the original post and must be filled in.
    params = {"dwagain": "ezL3icHsQsojeJ9sOp4rcGBub7ApWrIyP2onRJR9aX3oMou4VsQ4PmI9SaYEfR=="}
    requests.request("POST", url='', files=params)
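
# A hedged sketch, not part of the original post: if the download endpoint accepts an
# ordinary form-encoded POST instead of multipart, the request could be written as
# below. post_url and token are hypothetical placeholders; the real endpoint and
# payload are not given in the post.
def formdata_urlencoded(post_url, token):
    params = {"dwagain": token}  # same form field name as in formdata() above
    return requests.post(post_url, data=params, headers=headers, timeout=30, verify=False)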

if __name__ == '__main__':
    #options = webdriver.ChromeOptions()
    # Hide the automation banner and suppress the noisy Chrome logging.
    #options.add_experimental_option("excludeSwitches", ['enable-automation', 'enable-logging'])
    #driver = webdriver.Chrome(options=options)
    base_url = 'https://www.****.com'
    download_url = 'download-png.html'
    urls = ['https://www.***.com/free/watercolor,{}.html'.format(str(i)) for i in range(1,61)]
    for url in urls:
        print(url)
        html = getHTMLText(url)
        #print(html)
        soup_p = BeautifulSoup(html,'lxml')
        tab = soup_p.find('ul',class_='list-four-ul')
        hrefs = tab.find_all('article')
        #print(len(hrefs))
        num = 0
        for href in hrefs:
            num += 1
            one_url = href.a.get('href')
            print(base_url + one_url)
            downloadurl = base_url + one_url + download_url
            writeText(text=downloadurl + '\n')  # record the full download URL, not just the suffix
            #driver.get(base_url + one_url + download_url)
            #sleep(20)
            #if num>3:
            #    break
        sleep(20)
    #driver.quit()
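
The commented-out lines in __main__ only hint at the Selenium half of the workflow. Below is a minimal, separate sketch (not from the original post) of how the URLs recorded in H:\pngurl.txt could be fed back into a Chrome driver so that each page load triggers the PNG download; the file path and the 20-second wait are simply carried over from the script above.

# visit_recorded_urls.py -- hedged follow-up sketch
from time import sleep
from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_experimental_option("excludeSwitches", ['enable-automation', 'enable-logging'])
driver = webdriver.Chrome(options=options)

with open('H:\\pngurl.txt', encoding='utf-8') as f:
    urls = [line.strip() for line in f if line.strip()]

for url in urls:
    driver.get(url)   # loading the download page is what starts the PNG download
    sleep(20)         # crude wait so the download can finish; adjust as needed

driver.quit()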

 
