Python 实现简单的爬虫

四叶草

2020-01-07

Python 是一种跨平台的计算机程序设计语言，是一种面向对象的动态类型语言。Python 是自由软件，源代码和解释器 CPython 遵循 PSF 许可协议（Python Software Foundation License，与 GPL 兼容）。随着版本的不断更新和语言新功能的添加，Python 越来越多地被用于独立的、大型项目的开发。

快速抓取网页: 使用urllib最基本的抓取功能,将百度首页的内容保存到本地目录下.

>>> import urllib.request
>>>
>>> res=urllib.request.urlopen("https://www.baidu.com")
>>> print(res.read().decode("utf-8"))

>>> f=open("./test.html","wb")      #保存在本地
>>> f.write(res.read())
>>> f.close()

实现POST请求: 上述的例子是通过GET请求获取百度首页,下面使用urllib发送POST请求.

>>> import urllib.parse
>>> import urllib.request
>>>
>>> data=bytes(urllib.parse.urlencode({"hello":"lyshark"}),encoding="utf-8")
>>> print(data)
>>> response = urllib.request.urlopen('http://www.baidu.com/post',data=data)
>>> print(response.read())

设置TIMEOUT时间: 我们需要给请求设置一个超时时间,而不是让程序一直在等待结果.

import urllib.request

# Give up if the server has not answered within one second.
response = urllib.request.urlopen('http://www.baidu.com', timeout=1)
body = response.read()
print(body)

获取网站状态: 我们可以通过status、getheaders(),getheader("server"),获取状态码以及头部信息.

>>> import urllib.request
>>>
>>> res=urllib.request.urlopen("https://www.python.org")
>>> print(type(res))                # show the type of the object urlopen() returned
<class 'http.client.HTTPResponse'>
>>>
>>> res.status                      # status code of the response
>>> res.getheaders()                # all response headers
>>> res.getheader("server")         # value of the "server" header only

伪装访问网站: 给请求添加头部信息,从而定制自己请求网站时的头部信息,防止被和谐.

from urllib import request,parse

url = 'http://www.baidu.com'
# Custom request headers passed to the Request object below.
# NOTE(review): the Host header names a different site than `url` - confirm this is intentional.
headers = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
    'Host': 'mkdirs.org'
}
# Form fields for the POST body; named `form` so the builtin `dict` is not shadowed.
form = {
    'name': 'LyShark'
}
data = bytes(parse.urlencode(form), encoding='utf8')
req = request.Request(url=url, data=data, headers=headers, method='POST')
# The with-block closes the response once the body has been printed.
with request.urlopen(req) as response:
    print(response.read().decode('utf-8'))

简单的URL页面拼接:

import re

def Get_Url(target,start,ends):
    """Build the page URLs ``target/<n>`` for every n in ``range(start, ends)``.

    ``ends`` is exclusive. The URLs are returned as a list in ascending
    order of n; an empty range gives an empty list.
    """
    return [target + "/" + str(page) for page in range(start, ends)]

if __name__ == "__main__":
    # Print the gallery page URLs numbered 1 through 9 (the end bound is exclusive).
    url = Get_Url(target="https://www.mzitu.com/214261", start=1, ends=10)
    print(url)

requests库的使用:

import re
import requests

# Browser-like user agent sent with the request.
head = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
}

if __name__ == "__main__":
    ret = requests.get(url="https://www.mzitu.com/214261", headers=head, timeout=1)
    # Capture the quoted value of each `<img src="...` occurrence; re.S lets "." span newlines.
    img_src = re.compile('<img src="(.*?)"', re.S)
    all_pic_link = img_src.findall(ret.text)
    print(all_pic_link)

简单实现爬取图片:

import re
import urllib.request

def open_url(url):
    """Fetch ``url`` and return the response body decoded as UTF-8 text.

    A browser-like ``user-agent`` header is attached to the request.
    Errors raised by ``urlopen`` and ``UnicodeDecodeError`` from decoding
    propagate to the caller.
    """
    req = urllib.request.Request(url)
    req.add_header('user-agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36')
    # The with-block closes the response even if read() or decode() fails.
    with urllib.request.urlopen(req) as page:
        return page.read().decode("utf-8")

def get_img(html):
    """Download every ``.jpg`` referenced by an ``<img src="...">`` in ``html``.

    Each image is saved in the current working directory under the last
    ``/``-separated segment of its URL. The full URL and the file name are
    printed before each download. Returns ``None``.
    """
    # Raw string: "\." in an ordinary string literal is an invalid escape sequence.
    links = re.findall(r'<img src="([^"]+\.jpg)"', html)
    for each in links:
        filename = each.split("/")[-1]
        print("完整路径:",each)
        print("文件名称:",filename)
        urllib.request.urlretrieve(each, filename)

if __name__ == '__main__':
    # Download the page first, then save every .jpg it references.
    page_html = open_url("https://www.mzitu.com/210402")
    get_img(page_html)

爬每日CVE漏洞列表:

import re
import requests
from bs4 import BeautifulSoup

# Browser-like user agent sent with every request in this script.
head = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
}

def Get_CVE(url):
    """Return the ``href`` value of every ``<a>`` tag on the page at ``url``.

    The page is fetched with a 3 second timeout and parsed with the
    ``html.parser`` backend. The result is a list in document order.
    """
    page = requests.get(url=url, headers=head, timeout=3)
    soup = BeautifulSoup(page.text, 'html.parser')
    return [anchor.get('href') for anchor in soup.find_all('a')]

def Get_Number(list):
    """Turn link targets into ``CVE-<number>`` labels.

    Each item is converted with ``str()`` and searched for the first run of
    digits followed by ``-``; the label is ``"CVE-"`` plus everything from
    those digits to the end of the line. Items without such a match are
    skipped, so the result may be shorter than the input.

    The parameter name shadows the builtin ``list``; it is kept so existing
    keyword callers continue to work.
    """
    new = []
    for i in list:
        found = re.search("[0-9]{1,}-.*", str(i))
        if found is None:
            continue
        # Format the matched text itself, not the list that findall() returns.
        new.append("CVE-{}".format(found.group(0)))
    return new

if __name__ == "__main__":
    # Collect the links on today's change list, convert them to labels, print one per line.
    url= "https://cassandra.cerias.purdue.edu/CVE_changes/today.html"
    for label in Get_Number(Get_CVE(url)):
        print("今日份的漏洞:",label)

简单爬取西刺代理地址: 此处我们就用简单的正则匹配爬取,该方法比较笨拙.

import re
import requests

head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
ret = requests.get(url="https://www.xicidaili.com/nn/1", headers=head, timeout=3)
# Text of every single-line "<td>...</td>" match, with the td tags removed.
data = [cell.replace("<td>","").replace("</td>","") for cell in re.findall('<td>.*</td>', ret.text)]

# Five consecutive cells are read per proxy. Print at most 20 proxies and
# never index past the end of `data` when the page returned fewer cells.
for row in range(min(20, len(data) // 5)):
    IP, Port, Type, times, year = data[row * 5:row * 5 + 5]
    print("IP地址:{} 端口号:{} 类型:{} 生存周期:{} 时间:{}".format(IP,Port,Type,times,year))

BeautifulSoup 定位技巧: 使用bs库需要安装,三个依赖包 pip install requests bs4 lxml

from bs4 import BeautifulSoup
import requests

head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
ret = requests.get(url="https://lyshark.cnblogs.com", headers=head, timeout=3)
ret.encoding="utf-8"             # change this if the text comes out garbled
bs = BeautifulSoup(ret.text,"lxml")

# All statements below start at column 0: an indented top-level statement is an IndentationError.

# All <link> tags inside <head>; print the href of the first one
print(bs.select('head link')[0]['href'])

# All <a> tags whose class is c_b_p_desc_readmore; print the href of the first one
print(bs.find_all('a',class_='c_b_p_desc_readmore')[0]['href'])

# All <a> tags with id blog_nav_admin and class menu; print the href of the first one (two equivalent spellings)
print(bs.find_all('a',id='blog_nav_admin',class_='menu')[0]['href'])
print(bs.find_all('a',id='blog_nav_admin',class_='menu')[0].attrs['href'])

# <link> tags inside the div whose id is page_begin_html, and .menu elements inside ul#navList
print(bs.select('div[id="page_begin_html"] link')[0]['href'])
print(bs.select('ul[id="navList"] .menu')[0]['href'])

# div#page_begin_html directly under <body>, then its direct-child <link> elements; print the first
print(bs.select('body > div[id="page_begin_html"] > link')[0])

# Text content of a specific tag, and lookup of an <a> by its exact href
print(bs.select('title')[0].get_text())
print(bs.select('a[href="https://www.cnblogs.com/LyShark/archive/2019/12/04.html"]'))

# Descendant chains: elements nested inside divs that are selected by id / class
print(bs.select('div[id="header"] div[id="blogTitle"] a[id="lnkBlogLogo"]'))
print(bs.select('body div[id="header"] div[class="blogStats"] span[id="stats_post_count"]'))

stripped_strings方法的简单应用: 提取出 class 为 house-name 的标签下面的所有字符串

from bs4 import BeautifulSoup
import requests
import html5lib

head = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
}
ret = requests.get(url="https://gz.centanet.com/ershoufang/", headers=head, timeout=3)
# decode() already returns str, so the bytes are decoded explicitly as UTF-8 and used directly.
text = ret.content.decode('utf-8')

bs = BeautifulSoup(text,"html5lib")
ret = bs.select('div[class="section"] div[class="house-item clearfix"] p[class="house-name"]')
for name_tag in ret:
    # stripped_strings yields the whitespace-stripped strings under the tag; list() collects them.
    # (name_tag.get_text() would return the text as a single string, formatting included.)
    print(list(name_tag.stripped_strings))

实现爬取中国天气网:

from bs4 import BeautifulSoup
import requests
import html5lib

head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
ret = requests.get(url="http://www.weather.com.cn/textFC/shandong.shtml", headers=head, timeout=3)
text = ret.content.decode('utf-8')

bs = BeautifulSoup(text,"html5lib")
# Keep the located div so that the row search below is limited to it.
# NOTE(review): index 1 selects the second matching div, while the original
# comment described it as the first - confirm which one is wanted.
con_mid_tab = bs.find_all('div',class_='conMidtab')[1]
tr = con_mid_tab.find_all('tr')[2:]   # rows inside that div, starting from the third
for i in tr:
    td = i.find_all('td')             # all cells of this row
    if len(td) < 5:
        continue                      # td[-5] below needs at least five cells
    # stripped_strings yields the non-empty text fragments under a tag, whitespace stripped
    city_parts = list(td[0].stripped_strings)     # first cell: city
    temp_parts = list(td[-5].stripped_strings)    # fifth cell from the end: temperature
    if not city_parts or not temp_parts:
        continue                      # skip rows whose cells carry no text
    print('城市:{}   温度:{}'.format(city_parts[0],temp_parts[0]))

使用bs4库爬取西刺代理: 使用库的方式爬取,啪啪啪,三下五除二搞定.

import re
import requests
from bs4 import BeautifulSoup

head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
ret = requests.get(url="https://www.xicidaili.com/wt/", headers=head, timeout=3)
bs = BeautifulSoup(ret.text,"lxml")
ret = bs.select('table[id="ip_list"] tr[class="odd"]')

# One list of stripped text fragments per selected table row.
ip = [list(i.stripped_strings) for i in ret]

# Open the log once and let the with-block close it; at most 50 rows are
# written, and a shorter result list no longer raises IndexError.
with open("save.log",'a+',encoding='utf-8') as log:
    for row in ip[:50]:
        if len(row) < 2:
            continue                  # need both the address and the port fragment
        proxy_url = "http://{}:{}".format(row[0],row[1])
        print(proxy_url,file=log)
        print("代理地址(已保存) {}".format(proxy_url))

requests使用代理IP地址

import re
from time import sleep
import requests

head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
proxy = { "http":"http://127.0.0.1:9999" }
# Without credentials: "http": "http://ip:port"
# With credentials:    "https": "https://username:password@ip:port"

# Try every proxy address stored in save.log (one per line) against the target page.
with open("save.log","r",encoding="utf-8") as file:
    for i in file:
        data = i.strip()                # drop the trailing newline and surrounding whitespace
        if not data:
            continue                    # ignore blank lines
        # requests selects the proxy by the scheme of the target URL; the URL
        # below is https, so the current line is registered for both schemes.
        proxy.update(http=data, https=data)
        try:
            ret = requests.get(url="https://www.cnblogs.com/LyShark/", headers=head, timeout=3, proxies=proxy)
        except requests.RequestException:
            # An unreachable or timed-out proxy raises instead of returning a response.
            print("代理:{}  不在线,失败".format(proxy["http"]))
        else:
            if ret.status_code == 200:
                print("代理:{}  访问完成".format(proxy["http"]))
            else:
                print("代理:{}  不在线,失败".format(proxy["http"]))
        sleep(1)                        # pause between attempts

requests代理下载文件

import requests

head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
# requests selects the proxy by the scheme of the target URL; the download URL
# is https, so the same proxy is registered for both schemes.
proxy = { "http":"http://117.69.200.46:9999", "https":"http://117.69.200.46:9999" }

url = "https://nmap.org/dist/nmap-7.80-win32.zip"

# stream=True defers the body so it can be written chunk by chunk.
ret = requests.get(url=url, headers=head,stream=True,proxies=proxy)
ret.raise_for_status()                  # do not save an HTTP error page as the archive

# The with-block flushes and closes the file even if the download fails midway.
with open("nmap.zip","wb") as fp:
    for chunk in ret.iter_content(chunk_size=4096):
        if chunk:
            print("本次保存长度:{} ".format(len(chunk)))
            fp.write(chunk)

python python爬虫 response urllib