Python-爬虫-懒得写的部分
requests
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Minimal requests template: fetch one page and extract its <title> text."""
import re

import requests

url = ""  # target page; fill in before running
hd = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0",
}
# Proxy mapping. It is not applied by default; pass proxies=px to
# requests.get() to route the request through it.
px = {"http": "http://127.0.0.1:8888"}

# Without a timeout requests waits indefinitely on an unresponsive server.
rst = requests.get(url, headers=hd, timeout=10)
# re.S lets "." match newlines, so a title spanning several lines is captured.
title = re.compile(r"<title>(.*?)</title>", re.S).findall(rst.text)
urllib
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""urllib template: fetch a page with a fixed or randomly rotated User-Agent."""
import urllib
import urllib.request
import re
import random

# Browser disguise: install a global opener that sends a fixed User-Agent.
opener = urllib.request.build_opener()
# Named default_ua so it does not share a name with the UA() function below.
default_ua = ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36")
opener.addheaders = [default_ua]
urllib.request.install_opener(opener)

url = ""  # target page; fill in before running
data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")

# User-Agent pool. Each entry needs its trailing comma: adjacent string
# literals without one are concatenated into a single string.
uapools = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0",
    # add further User-Agent strings here
]


def UA():
    """Install a global opener whose User-Agent is chosen at random from uapools."""
    opener = urllib.request.build_opener()
    thisua = random.choice(uapools)
    ua = ("User-Agent", thisua)
    # addheaders is the attribute OpenerDirector reads when sending a request.
    opener.addheaders = [ua]
    urllib.request.install_opener(opener)
    # Debug aid: print("当前使用UA:" + str(thisua))


for i in range(0, 10):
    UA()  # switch to a fresh random User-Agent before each request
    data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
范例
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Example crawler: walk a range of pages with a random User-Agent per request
and print every regex match found on each page."""
import urllib.request
import re
import random
import time

uapools = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36",
]


def UA():
    """Install a global opener whose User-Agent is chosen at random from uapools."""
    opener = urllib.request.build_opener()
    thisua = random.choice(uapools)
    ua = ("User-Agent", thisua)
    opener.addheaders = [ua]
    urllib.request.install_opener(opener)
    print("当前使用UA: " + str(thisua))


for i in range(0, 35):  # total number of pages
    UA()
    thisurl = ""  # build the URL for page i
    try:
        data = urllib.request.urlopen(thisurl).read().decode("utf-8", "ignore")
        pat = ""  # build the regular expression
        rst = re.compile(pat, re.S).findall(data)
        for item in rst:  # print each match
            print(item)
            print("------")
    except Exception as err:
        # Best effort: one failing page must not stop the crawl, but the
        # failure is reported rather than silently dropped.
        print("page " + str(i) + " failed: " + repr(err))
相关推荐
fangjack 2020-06-02
夜斗不是神 2020-11-17
染血白衣 2020-11-16
ARCXIANG 2020-11-02
ARCXIANG 2020-10-28
CycloneKid 2020-10-27
荒谬小孩 2020-10-26
逍遥友 2020-10-26
snakeson 2020-10-09
meylovezn 2020-08-28
囧芝麻 2020-08-17
数据挖掘工人 2020-08-15
cxcxrs 2020-07-28
dashoumeixi 2020-07-20
sunzhihaofuture 2020-07-19
我欲疾风前行 2020-07-06
sunzhihaofuture 2020-07-04