爬取素材库直接存入mysql数据库

爬取素材库,并将结果直接存入 MySQL 数据库。
包含将 HTML 源码存入数据库时所需的转义函数。
入库前会去掉源码中的 HTML 注释语句。

import re
import requests
import random
import time
from bs4 import BeautifulSoup
import pymysql

#html源码进数据库,转义函数
def transferContent(content):
    """Backslash-escape quotes and backslashes in *content*.

    Each double quote becomes ``\\"``, each single quote becomes ``\\'`` and
    each backslash becomes ``\\\\``; every other element is passed through
    ``str()`` unchanged.

    Args:
        content: The text to escape (typically HTML source), or ``None``.

    Returns:
        The escaped string, or ``None`` when *content* is ``None``.
    """
    if content is None:
        return None

    # Characters that need a leading backslash, mapped to their escaped form.
    escapes = {'"': '\\"', "'": "\\'", "\\": "\\\\"}

    # Build the result in one pass with join instead of repeated "+=".
    # Non-string elements are stringified, matching the original fallback.
    return "".join(
        escapes.get(c, c) if isinstance(c, str) else str(c)
        for c in content
    )

# Pool of browser User-Agent strings; one is picked at random per run.
user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
# Pick one User-Agent string at random from the pool above.
UA = random.choice(user_agent_list)
# Request headers carrying the chosen User-Agent; sent with every page request.
headers = {'User-Agent': UA}

# Connect to the MySQL database that receives the scraped rows.
conn = pymysql.connect(user='root', password='1234', host='127.0.0.1', database='sucai')

# Base listing URL; the page number is appended to it for each request.
url = 'https://www.***.com/***_0_'

# Matches any "<!...>" markup (HTML comments, doctype declarations) up to the
# first ">". Compiled once here rather than once per scraped item.
re_comment = re.compile(r'<![^>]*>')

try:
    for x in range(26, 724):
        # Random 1-5 second pause between page requests.
        time.sleep(random.randint(1, 5))

        with requests.get(url + str(x), headers=headers, timeout=5) as response:
            soup = BeautifulSoup(response.text, 'lxml')

        for li_quick in soup.find_all('div', class_='material-item'):
            # Source code: inner markup of the item-code div, excluding the
            # div tag itself.
            item_code = li_quick.find('div', class_='item-code').decode_contents()
            item_code = re_comment.sub('', item_code)  # drop "<!...>" markup
            # Strip surrounding whitespace, then escape quotes/backslashes.
            # NOTE(review): the insert below is parameterized, so the driver
            # escapes values itself; this extra escaping likely stores literal
            # backslashes in `rowcode` -- confirm whether that is intended.
            item_code = transferContent(item_code.strip())

            item_bottom = li_quick.find('div', class_='item-bottom')

            # Description: text of the link inside the item-info div.
            item_info = item_bottom.find('div', class_='item-info').a.get_text().strip()

            # Labels: text of every span, each followed by ';'.
            item_label = item_bottom.find('div', class_='item-label')
            item_label_0 = ''.join(
                kj.get_text() + ';' for kj in item_label.find_all('span')
            )

            # The context manager closes the cursor even if the insert fails.
            with conn.cursor() as cursor:
                cursor.execute(
                    'insert into sucaix (ye,rowcode,info,lablex) values (%s,%s,%s,%s)',
                    [str(x), item_code, item_info, item_label_0],
                )
                # Commit after every row so completed inserts survive a
                # failure later in the crawl.
                conn.commit()
            print(x)
finally:
    # Always release the database connection, including on errors.
    conn.close()

相关推荐