分享一个爬取HRBUST(哈尔滨理工大学)学生成绩的Python程序(OCR自动识别验证码)

BitTigerio

2018-01-21

Python版本：3.5.2
日期：2018/1/21

__Author__ = "Lance#"

# -*- coding: utf-8 -*-

from urllib import request
from urllib import parse
from http import cookiejar
from aip.ocr import AipOcr
import re

class Hust(object):
    """Client that logs in to the jwzx.hrbust.edu.cn academic site and prints scores.

    Typical flow: ``ocr_captcha()`` -> ``set_postdata()`` -> ``login()`` ->
    ``get_scoal()`` -> ``display()``. All requests made by one instance go
    through that instance's own cookie-aware opener, so the session cookie
    obtained with the captcha is reused for the login and the score query.
    """

    # Pattern used to pull the content of every plain <td> cell out of the page.
    _TD_PATTERN = re.compile('<td>(.*?)</td>', re.S)
    # display() skips this many leading cells before the first score record.
    _HEADER_CELLS = 3
    # display() treats every run of this many cells as one score record.
    _CELLS_PER_ROW = 13

    def __init__(self, stu_id, passwd):
        """Store the credentials and build the cookie-aware HTTP opener.

        :param stu_id: student number, sent as ``j_username``.
        :param passwd: password, sent as ``j_password``.
        """
        # Captcha image, login endpoint and score-query endpoint.
        self.__url_check = "http://jwzx.hrbust.edu.cn/academic/getCaptcha.do"
        self.__url_login = "http://jwzx.hrbust.edu.cn/academic/j_acegi_security_check"
        self.__url_scoal = "http://jwzx.hrbust.edu.cn/academic/manager/score/studentOwnScore.do"
        # Request headers that imitate a desktop browser.
        self.__headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:57.0) Gecko/20100101 Firefox/57.0"
        }

        self.__captcha = ''
        # Credentials for the OCR service; replace with the ID and keys
        # issued to your own account.
        self.__APP_ID = 'xxxxxx'
        self.__API_KEY = 'xxxxxx'
        self.__SECRET_KEY = 'xxxxxx'

        # Login form fields (as captured from the browser); the captcha is
        # filled in later by set_postdata().
        self.__post_data = {
            "groupId": "",
            "j_username": stu_id,
            "j_password": passwd,
            "j_captcha": ''
        }

        # Cookie jar -> cookie processor -> opener, so cookies persist
        # across the requests made by this instance.
        self.__cookie = cookiejar.CookieJar()
        self.__cookieProc = request.HTTPCookieProcessor(self.__cookie)
        self.__opener = request.build_opener(self.__cookieProc)
        # Kept for backward compatibility: plain request.urlopen() calls made
        # elsewhere keep sharing this session. The methods below no longer
        # rely on it, so creating a second instance does not hijack this one.
        request.install_opener(self.__opener)

    @staticmethod
    def _extract_captcha(ocr_result):
        """Return the first recognised text from an OCR response dict.

        :param ocr_result: dict returned by ``AipOcr.basicGeneral``.
        :return: the ``words`` of the first entry, or ``''`` when the OCR
            recognised nothing (the caller can then retry with a new image).
        :raises KeyError: if the response has no ``words_result`` entry
            (presumably an API error response -- verify against the OCR docs).
        """
        if 'words_result' not in ocr_result:
            raise KeyError("OCR response has no 'words_result': %r" % (ocr_result,))
        words = ocr_result['words_result']
        if not words:
            return ''
        return words[0]['words']

    @classmethod
    def _parse_cells(cls, html):
        """Return the text of every ``<td>`` cell in *html*, indentation removed."""
        return [cell.replace("\n        ", "") for cell in cls._TD_PATTERN.findall(html)]

    def ocr_captcha(self):
        """Download a fresh captcha image and store the OCR-recognised text."""
        req = request.Request(self.__url_check, headers=self.__headers)
        # Use this instance's opener so the captcha is tied to its cookie jar,
        # and close the response deterministically.
        with self.__opener.open(req) as resp:
            image = resp.read()

        # Hand the raw image bytes to the OCR service.
        client = AipOcr(self.__APP_ID, self.__API_KEY, self.__SECRET_KEY)
        self.__captcha = self._extract_captcha(client.basicGeneral(image))

    def get_captcha(self):
        """Return the most recently recognised captcha text."""
        return self.__captcha

    def set_postdata(self):
        """Copy the current captcha text into the login form data."""
        self.__post_data["j_captcha"] = self.__captcha

    def login(self):
        """Submit the login form and return the response page as text.

        :return: the response body decoded as GBK (the encoding of the
            login page).
        """
        # URL-encode the form fields into a request body.
        payload = parse.urlencode(self.__post_data).encode()
        req = request.Request(self.__url_login, headers=self.__headers)
        with self.__opener.open(req, data=payload) as resp:
            return resp.read().decode("GBK")

    def get_scoal(self):
        """Fetch the score page and return the text of all its ``<td>`` cells.

        :return: list of cell strings in document order.
        """
        req = request.Request(self.__url_scoal, headers=self.__headers)
        with self.__opener.open(req) as resp:
            page = resp.read().decode()
        return self._parse_cells(page)

    def display(self, list):
        """Print the score table built from the flat cell list.

        :param list: cell strings as returned by ``get_scoal()``. The first
            3 cells are skipped; every following group of 13 cells is one
            record. Trailing cells that do not fill a record are ignored.
        """
        cells = list  # the parameter name is part of the public interface
        first = self._HEADER_CELLS
        width = self._CELLS_PER_ROW
        row_count = (len(cells) - first) // width
        rows = [cells[start:start + width]
                for start in range(first, first + row_count * width, width)]

        print("学年   学期   及格标志    分数       学分           课程名")

        for item in rows:
            # NOTE(review): the original chained two .replace('', "") calls on
            # item[6]; as written they are no-ops (presumably markup-stripping
            # arguments were lost from the listing) -- confirm against the page.
            print("{}   {}    {:>5s}      {:5s}    {:^5s}  {:^20s}".format(
                item[0], item[1], item[12], item[6], item[7], item[3]))

def main():
    """Log in with OCR-recognised captchas, retrying until accepted, then print scores."""
    # Text the login response contains when the captcha was rejected.
    wrong_captcha_msg = "输入的验证码不正确！"
    attempt = 1

    # Put your own student number and password here.
    student = Hust("xxxxxx", "xxxxxx")
    while True:
        student.ocr_captcha()
        print("识别到的验证码为: %s     ------      " % student.get_captcha(), end="")
        student.set_postdata()
        page = student.login()
        if wrong_captcha_msg not in page:
            print("验证码正确")
            break
        attempt += 1
        print("验证码错误，启动第%d次识别" % attempt)

    print()
    print("Scoal Info".center(70, "-"))
    # Named 'scores' rather than 'list' so the builtin is not shadowed.
    scores = student.get_scoal()
    student.display(scores)
    print("End".center(70, "-"))


if __name__ == '__main__':
    main()

完成效果图：
分享一个爬取HUST(哈理工)学生成绩的Python程序(OCR自动识别验证码)