python如何处理掉12306的验证码

Python (199) 2023-06-19 07:06:51

本文实例讲述了Python实现破解12306图片验证码的方法。分享给大家供大家参考,具体如下:

不知从何时起,12306的登录验证码竟然变成了按字找图,可以说是又提高了一个档次,竟然把图像识别都用上了。不过有些图片,不得不说有些变态,图片的清晰度就更别说了,明显是从网络上的图库中搬过来的。

相关推荐:《Python基础教程》

谁知没多久,网络就惊现破解12306图片验证码的Python代码了,作为一个爱玩爱刺激的网虫,当然要分享一份过来。

代码大致流程:

1、将验证码图片下载下来,然后切图;

2、利用百度识图进行图片分析;

3、再利用正则表达式来取出百度识图的关键字,最后输出。

代码:

#!/usr/bin/python
## FileName: fuck12306.py
## Author: MaoMao Wang <andelf@gmail.com>
## Created: Mon Mar 16 22:08:41 2015 by ShuYu Wang
## Copyright: Feather (c) 2015
## Description: fuck fuck 12306
## Time-stamp: <2015-03-17 10:57:44 andelf>
import json
import re
import ssl
import urllib
import urllib2

from PIL import Image
from PIL import ImageFilter

# hack CERTIFICATE_VERIFY_FAILED
# https://github.com/mtschirs/quizduellapi/issues/2
# Work around CERTIFICATE_VERIFY_FAILED: when this Python build exposes
# ssl._create_unverified_context, install it as the default HTTPS context.
# NOTE: this turns off certificate verification for the whole process.
if hasattr(ssl, '_create_unverified_context'):
    ssl._create_default_https_context = ssl._create_unverified_context

# Browser-like User-Agent header value used for the image-search requests.
UA = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) "
      "AppleWebKit/537.36 (KHTML, like Gecko) "
      "Chrome/41.0.2272.89 Safari/537.36")

# URL the captcha image is downloaded from.
pic_url = "https://kyfw.12306.cn/otn/passcodeNew/getPassCodeNew?module=login&rand=sjrand&0.21191171556711197"
def get_img():
    """Download the captcha from ``pic_url`` and return it as a PIL image.

    The raw response body is written to ``./tmp.jpg`` (overwriting any
    existing file) and the image is then opened from that path.

    Returns:
        The object returned by ``Image.open("./tmp.jpg")``.
    """
    # NOTE: urllib.urlopen is the Python 2 API this script was written for.
    resp = urllib.urlopen(pic_url)
    try:
        raw = resp.read()
    finally:
        # Release the connection even when the read fails.
        resp.close()
    with open("./tmp.jpg", 'wb') as fp:
        fp.write(raw)
    return Image.open("./tmp.jpg")
def get_sub_img(im, x, y):
    """Crop one tile out of the captcha image.

    Tiles are 67 px squares laid out on a 72 px pitch (67 px tile + 5 px
    gap), starting 5 px from the left edge and 41 px from the top edge.

    Args:
        im: Image object providing ``crop((left, top, right, bottom))``.
        x: Column index, 0..3.
        y: Row index, 0..2.

    Returns:
        Whatever ``im.crop`` returns for the tile's bounding box.

    Raises:
        AssertionError: If ``x`` or ``y`` is outside the accepted range.
    """
    assert 0 <= x <= 3
    # NOTE(review): y == 2 is accepted here although the script's main loop
    # only iterates two rows -- confirm whether the bound should be 1.
    assert 0 <= y <= 2
    tile = 67
    pitch = tile + 5
    left = 5 + pitch * x
    top = 41 + pitch * y
    return im.crop((left, top, left + tile, top + tile))
def baidu_stu_lookup(im):
    """Upload an image to Baidu image search and return its keywords.

    The image is saved to ``./query_temp_img.png``, its bytes are POSTed to
    the upload endpoint (which answers with a plain URL), and that URL is
    then submitted to the search page. The resulting HTML is handed to
    ``baidu_stu_html_extract``.

    Args:
        im: Image object providing ``save(path)``.

    Returns:
        The string produced by ``baidu_stu_html_extract`` for the result page.
    """
    upload_url = ("http://stu.baidu.com/n/image?fr=html5&needRawImageUrl=true"
                  "&id=WU_FILE_0&name=233.png&type=image%2Fpng"
                  "&lastModifiedDate=Mon+Mar+16+2015+20%3A49%3A11+GMT%2B0800+(CST)"
                  "&size=")
    im.save("./query_temp_img.png")
    with open("./query_temp_img.png", 'rb') as fp:
        raw = fp.read()
    # The upload endpoint takes the payload size as the last query parameter.
    upload_url = upload_url + str(len(raw))

    # NOTE: urllib2 / urllib.quote are the Python 2 APIs this script targets.
    req = urllib2.Request(upload_url, raw,
                          {'Content-Type': 'image/png', 'User-Agent': UA})
    resp = urllib2.urlopen(req)
    try:
        resp_url = resp.read()  # the response body is a bare URL
    finally:
        resp.close()

    search_url = ("http://stu.baidu.com/n/searchpc?queryImageUrl="
                  + urllib.quote(resp_url))
    req = urllib2.Request(search_url, headers={'User-Agent': UA})
    resp = urllib2.urlopen(req)
    try:
        html = resp.read()
    finally:
        resp.close()
    return baidu_stu_html_extract(html)
def baidu_stu_html_extract(html):
    """Pull the guessed keywords out of a Baidu image-search result page.

    The page is searched for the first ``keywords:'...'`` fragment. Its
    payload is a JSON list of objects with a ``keyword`` field, in which
    double quotes appear as the literal text ``\\x22`` and backslashes are
    doubled.

    Args:
        html: Page source as a string.

    Returns:
        The keywords joined with ``|``, or ``'[UNKNOWN]'`` when the fragment
        is missing or the list is empty.

    Raises:
        ValueError: If the extracted payload is not valid JSON.
        KeyError: If a list entry has no ``keyword`` field.
    """
    match = re.search(r"keywords:'(.*?)'", html)
    if match is None:
        return '[UNKNOWN]'
    # Undo the JavaScript-string escaping so the payload parses as JSON.
    json_str = match.group(1).replace('\\x22', '"').replace('\\\\', '\\')
    result = [item['keyword'] for item in json.loads(json_str)]
    return '|'.join(result) if result else '[UNKNOWN]'
def ocr_question_extract(im):
    """Run OCR on the strip of the captcha that holds the question text.

    The box ``(127, 3, 260, 22)`` is cropped out of ``im``, cleaned up with
    ``pre_ocr_processing`` and passed to ``pytesseract.image_to_string``.

    Args:
        im: Image object providing ``crop``.

    Returns:
        The recognised text with surrounding whitespace stripped, or ``None``
        when pytesseract cannot be imported (an error line is printed).
    """
    # pytesseract source: git@github.com:madmaze/pytesseract.git
    # Imported lazily so the rest of the script works without it; ``global``
    # binds the module name at module level.
    global pytesseract
    try:
        import pytesseract
    except ImportError:
        print("[ERROR] pytesseract not installed")
        return None
    question = im.crop((127, 3, 260, 22))
    question = pre_ocr_processing(question)
    return pytesseract.image_to_string(question).strip()
def pre_ocr_processing(im):
    """Return a black-and-white copy of ``im`` prepared for OCR.

    A background estimate is built by blurring the image and applying a
    23 px maximum filter. Each output pixel is 255 plus the largest
    per-channel difference between the image and that estimate, capped at
    255. The result is then binarised with a threshold of 150.

    Args:
        im: PIL image; it is converted to RGB first.

    Returns:
        A new mode ``'L'`` image containing only the values 0 and 255.
    """
    im = im.convert("RGB")
    width, height = im.size
    white = im.filter(ImageFilter.BLUR).filter(ImageFilter.MaxFilter(23))
    grey = im.convert('L')
    impix = im.load()
    whitepix = white.load()
    greypix = grey.load()
    for y in range(height):
        for x in range(width):
            src = impix[x, y]
            bg = whitepix[x, y]
            greypix[x, y] = min(255, max(255 + src[0] - bg[0],
                                         255 + src[1] - bg[1],
                                         255 + src[2] - bg[2]))
    new_im = grey.copy()
    binarize(new_im, 150)  # modifies new_im in place
    return new_im
def binarize(im, thresh=120):
    """Threshold a greyscale image in place.

    Every pixel below ``thresh`` becomes 0; every other pixel becomes 255.

    Args:
        im: Mode ``'L'`` image providing ``size``, ``getpixel`` and
            ``putpixel``. It is modified in place.
        thresh: Cut-off value, strictly between 0 and 255.

    Returns:
        None.

    Raises:
        AssertionError: If ``thresh`` is out of range or the image mode is
            not ``'L'``.
    """
    assert 0 < thresh < 255
    assert im.mode == 'L'
    width, height = im.size
    # range() instead of xrange() so the loop runs on Python 2 and 3 alike.
    for y in range(height):
        for x in range(width):
            value = 0 if im.getpixel((x, y)) < thresh else 255
            im.putpixel((x, y), value)
if __name__ == '__main__':
    # Download a captcha, OCR its question, then look up each of the
    # 2 rows x 4 columns of tiles on Baidu image search.
    im = get_img()
    # To re-run on an already downloaded captcha instead:
    #     im = Image.open("./tmp.jpg")
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("OCR Question: %s" % (ocr_question_extract(im),))
    for y in range(2):
        for x in range(4):
            im2 = get_sub_img(im, x, y)
            result = baidu_stu_lookup(im2)
            print("(%d, %d) %s" % (y, x, result))
THE END

发表回复