Crawling and Downloading Ebooks from the Kankindle Site with Python

Python 2023-04-15 10:36:26

A Python script that downloads ebooks from Kankindle (kankindle.com). It automatically fetches every ebook on the first 13 pages of the site's listing, saves them into an ebook directory, and checks whether each book has already been downloaded so nothing is fetched twice.

#!/usr/bin/env python
# coding=utf-8
from bs4 import BeautifulSoup
import urllib2
import re
import os

def download(url):
    print 'starting download %s' % url
    response = urllib2.urlopen(url, timeout=30)
    html_data = response.read()

    soup = BeautifulSoup(html_data, 'html.parser')
    print 'start to analyse ---------------'

    # The download link is the <a> inside the element with class
    # "yanshi_xiazai"; the book title is the page's <h1>.
    title_soup = soup.find_all(class_='yanshi_xiazai')
    name_soup = soup.find_all('h1')
    tag_a = title_soup[0].a.attrs['href']
    tag_name = title_soup[0].a.contents
    link_name = name_soup[0]
    link_name = str(link_name).replace("<h1>", "").replace("</h1>", "")
    # print tag_name[0]
    # print link_name

    filename = link_name + ".mobi"
    filename = "ebook/" + filename
    print 'filename is: %s' % filename

    print "downloading with urllib2 %s" % tag_a
    if os.path.exists(filename):
        print 'already downloaded, ignore'
    else:
        try:
            f = urllib2.urlopen(tag_a, timeout=60)
            data = f.read()
            # print 'the data is %s' % data
            with open(filename, "wb") as code:
                code.write(data)
        except Exception, e:
            print e
def get_all_link(url):
    print 'Starting get all the list'
    response = urllib2.urlopen(url, timeout=30)
    html_data = response.read()
    # print html_data

    soup = BeautifulSoup(html_data, 'html.parser')
    link_soup = soup.find_all('a')
    # print link_soup

    # Book detail pages contain "view" in their URL; follow each one.
    for each_link in link_soup:
        if re.search('view', str(each_link)):
            print each_link
            print each_link.attrs['href']
            download(each_link.attrs['href'])
if __name__ == '__main__':
    # Make sure the output directory exists before downloading.
    if not os.path.exists('ebook'):
        os.makedirs('ebook')
    for page in range(1, 14):  # the first 13 pages of the listing
        url = "http://kankindle.com/simple/page/" + str(page)
        print url
        get_all_link(url)
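
The script above is Python 2 only: urllib2 and the print statement no longer exist in Python 3. As a minimal sketch, assuming the site still serves the same markup (the yanshi_xiazai download block and detail links containing "view"), a Python 3 port of the same logic might look like this:

#!/usr/bin/env python3
# A rough Python 3 port of the crawler above; urllib2 became
# urllib.request, and prints became function calls. The selectors
# ("yanshi_xiazai", <h1>, "view" links) are assumed unchanged.
import os
import re
from urllib.request import urlopen

from bs4 import BeautifulSoup

def download(url):
    print('starting download %s' % url)
    html_data = urlopen(url, timeout=30).read()
    soup = BeautifulSoup(html_data, 'html.parser')

    # Download link and book title, same selectors as the original.
    tag_a = soup.find_all(class_='yanshi_xiazai')[0].a.attrs['href']
    link_name = soup.find_all('h1')[0].get_text()
    filename = os.path.join('ebook', link_name + '.mobi')

    if os.path.exists(filename):
        print('already downloaded, ignore')
        return
    try:
        data = urlopen(tag_a, timeout=60).read()
        with open(filename, 'wb') as f:
            f.write(data)
    except Exception as e:
        print(e)

def get_all_link(url):
    html_data = urlopen(url, timeout=30).read()
    soup = BeautifulSoup(html_data, 'html.parser')
    # Follow every link whose URL contains "view" (a detail page).
    for each_link in soup.find_all('a'):
        href = each_link.attrs.get('href', '')
        if re.search('view', href):
            print(href)
            download(href)

if __name__ == '__main__':
    os.makedirs('ebook', exist_ok=True)
    for page in range(1, 14):
        url = 'http://kankindle.com/simple/page/' + str(page)
        print(url)
        get_all_link(url)

Run it with python3 after installing beautifulsoup4; as before, the .mobi files land in the ebook directory, and books already on disk are skipped.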