本文实例讲述了Python实现破解12306图片验证码的方法。分享给大家供大家参考,具体如下:
不知从何时起,12306的登录验证码竟然变成了按字找图,可以说是又提高了一个档次,竟然把图像识别都用上了。不过有些图片不得不说有些变态,图片的清晰度就更别说了,明显是从网络上的图库中搬过来的。
相关推荐:《Python基础教程》
谁知没多久,网络就惊现破解12306图片验证码的Python代码了,作为一个爱玩爱刺激的网虫,当然要分享一份过来。
代码大致流程:
1、将验证码图片下载下来,然后切图;
2、利用百度识图进行图片分析;
3、再利用正则表达式来取出百度识图的关键字,最后输出。
代码:
#!/usr/bin/python
## FileName    : fuck12306.py
## Author      : MaoMao Wang <andelf@gmail.com>
## Created     : Mon Mar 16 22:08:41 2015 by ShuYu Wang
## Copyright   : Feather (c) 2015
## Description : fuck fuck 12306
## Time-stamp: <2015-03-17 10:57:44 andelf>
"""Download a 12306 login captcha and try to describe its contents.

Steps performed when run as a script:

1. fetch the captcha image from ``pic_url`` and save it as ``./tmp.jpg``;
2. OCR a strip of the image with pytesseract (optional dependency);
3. crop the tiles of a 4-column by 2-row grid and upload each one to the
   Baidu image search endpoint;
4. pull the ``keywords`` payload out of the result page with a regular
   expression and print it.

The code runs on both Python 2.7 and Python 3.
"""

import json
import re
import ssl
from contextlib import closing

try:  # Python 3
    from urllib.parse import quote
    from urllib.request import Request, urlopen
except ImportError:  # Python 2
    from urllib import quote
    from urllib2 import Request, urlopen

from PIL import Image
from PIL import ImageFilter

# hack CERTIFICATE_VERIFY_FAILED
# https://github.com/mtschirs/quizduellapi/issues/2
# SECURITY: this turns off HTTPS certificate verification for every request
# made by this process. Kept because the script relies on it, but do not copy
# this into code that handles anything sensitive.
if hasattr(ssl, '_create_unverified_context'):
    ssl._create_default_https_context = ssl._create_unverified_context

UA = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 "
      "(KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36")

pic_url = ("https://kyfw.12306.cn/otn/passcodeNew/getPassCodeNew"
           "?module=login&rand=sjrand&0.21191171556711197")

# Matches the single-quoted value that follows ``keywords:`` in the result page.
_KEYWORDS_RE = re.compile(r"keywords:'(.*?)'")

# Returned whenever no keyword could be extracted for an image.
_UNKNOWN = '[UNKNOWN]'


def get_img():
    """Download the captcha and return it as a PIL image.

    The raw bytes are written to ``./tmp.jpg`` first and the image is opened
    from that file, so the file is left behind after the call.
    """
    with closing(urlopen(pic_url)) as resp:
        raw = resp.read()
    with open("./tmp.jpg", 'wb') as fp:
        fp.write(raw)
    return Image.open("./tmp.jpg")


def get_sub_img(im, x, y):
    """Crop one square tile out of the captcha image.

    :param im: PIL image of the full captcha.
    :param x: column index of the tile, 0..3.
    :param y: row index of the tile, 0..2.
    :returns: the cropped 67x67 PIL image.
    :raises AssertionError: if *x* or *y* is out of range.
    """
    assert 0 <= x <= 3
    # NOTE(review): the driver below only ever asks for rows 0 and 1; the
    # upper bound of 2 is kept from the original -- confirm it is intended.
    assert 0 <= y <= 2
    tile = 67  # edge length of a cropped tile, in pixels
    gap = 5    # spacing between neighbouring tiles, in pixels
    left = 5 + (tile + gap) * x
    top = 41 + (tile + gap) * y
    return im.crop((left, top, left + tile, top + tile))


def baidu_stu_lookup(im):
    """Upload *im* to Baidu image search and return the keywords found.

    The image is saved to ``./query_temp_img.png`` and that file's bytes are
    POSTed; the response body is then passed as ``queryImageUrl`` to the
    search page, whose HTML is handed to :func:`baidu_stu_html_extract`.

    :param im: PIL image to look up.
    :returns: ``'|'``-joined keywords, or ``'[UNKNOWN]'``.
    """
    upload_url = ("http://stu.baidu.com/n/image?fr=html5&needRawImageUrl=true"
                  "&id=WU_FILE_0&name=233.png&type=image%2Fpng"
                  "&lastModifiedDate=Mon+Mar+16+2015+20%3A49%3A11+GMT%2B0800+(CST)"
                  "&size=")
    im.save("./query_temp_img.png")
    with open("./query_temp_img.png", 'rb') as fp:
        raw = fp.read()

    # The endpoint expects the payload size as the last query parameter.
    req = Request(upload_url + str(len(raw)), raw,
                  {'Content-Type': 'image/png', 'User-Agent': UA})
    with closing(urlopen(req)) as resp:
        resp_url = resp.read()  # the body is a bare URL

    search_url = ("http://stu.baidu.com/n/searchpc?queryImageUrl="
                  + quote(resp_url))
    req = Request(search_url, headers={'User-Agent': UA})
    with closing(urlopen(req)) as resp:
        html = resp.read()
    return baidu_stu_html_extract(html)


def baidu_stu_html_extract(html):
    """Extract the keyword list from a Baidu image search result page.

    :param html: page source, as bytes or text.
    :returns: the ``keyword`` values of the first ``keywords:'...'`` payload
        joined with ``'|'``; ``'[UNKNOWN]'`` when the payload is missing,
        empty, or cannot be parsed.
    """
    if isinstance(html, bytes):
        # HTTP responses are bytes on Python 3, where a text pattern cannot
        # search them. UTF-8 matches what json.loads assumes for byte
        # strings on Python 2.
        html = html.decode('utf-8', 'replace')

    matches = _KEYWORDS_RE.findall(html)
    if not matches:
        return _UNKNOWN

    # The payload is JSON embedded in a JavaScript string literal: undo the
    # \x22 (double quote) and doubled-backslash escapes before parsing.
    json_str = matches[0].replace('\\x22', '"').replace('\\\\', '\\')
    try:
        result = [item['keyword'] for item in json.loads(json_str)]
    except (ValueError, KeyError, TypeError):
        # Malformed JSON or an unexpected structure: treat as "no keywords"
        # rather than aborting the whole run.
        return _UNKNOWN
    return '|'.join(result) if result else _UNKNOWN


def ocr_question_extract(im):
    """OCR a strip of the captcha image with pytesseract.

    pytesseract (git@github.com:madmaze/pytesseract.git) is imported lazily
    so the rest of the script works without it.

    :param im: PIL image of the full captcha.
    :returns: the recognised text with surrounding whitespace stripped, or
        ``None`` (after printing an error) when pytesseract is missing.
    """
    try:
        import pytesseract
    except ImportError:
        print("[ERROR] pytesseract not installed")
        return None
    # Presumably the strip of the captcha that holds the question text.
    im = im.crop((127, 3, 260, 22))
    im = pre_ocr_processing(im)
    # NOTE(review): no language is passed to tesseract, so its default is
    # used -- confirm that is intended for this captcha.
    return pytesseract.image_to_string(im).strip()


def pre_ocr_processing(im):
    """Return a binarized greyscale copy of *im* prepared for OCR.

    A blurred, max-filtered copy of the image is subtracted from the image
    per channel (offset by 255); the largest channel value, capped at 255,
    becomes the grey level. The result is then thresholded at 150.

    :param im: PIL image.
    :returns: a new mode ``'L'`` image containing only 0 and 255.
    """
    im = im.convert("RGB")
    width, height = im.size
    white = im.filter(ImageFilter.BLUR).filter(ImageFilter.MaxFilter(23))
    grey = im.convert('L')
    impix = im.load()
    whitepix = white.load()
    greypix = grey.load()
    for y in range(height):
        for x in range(width):
            src = impix[x, y]
            ref = whitepix[x, y]
            greypix[x, y] = min(255, max(255 + src[0] - ref[0],
                                         255 + src[1] - ref[1],
                                         255 + src[2] - ref[2]))
    new_im = grey.copy()
    binarize(new_im, 150)
    return new_im


def binarize(im, thresh=120):
    """Threshold a greyscale image in place.

    Pixels below *thresh* become 0, all others become 255.

    :param im: PIL image in mode ``'L'``; modified in place.
    :param thresh: cut-off value, strictly between 0 and 255.
    :raises AssertionError: if *thresh* is out of range or *im* is not
        mode ``'L'``.
    """
    assert 0 < thresh < 255
    assert im.mode == 'L'
    w, h = im.size
    pix = im.load()
    for y in range(h):
        for x in range(w):
            pix[x, y] = 0 if pix[x, y] < thresh else 255


def main():
    """Fetch one captcha, print the OCR result and the keywords per tile."""
    im = get_img()
    print('OCR Question: %s' % (ocr_question_extract(im),))
    for y in range(2):
        for x in range(4):
            im2 = get_sub_img(im, x, y)
            result = baidu_stu_lookup(im2)
            print('%s %s' % ((y, x), result))


if __name__ == '__main__':
    main()
下一篇