Implementing a Simple Web Crawler in Python
Python is a cross-platform, dynamically typed, object-oriented programming language. It is free software: its source code and the CPython reference interpreter are distributed under a GPL-compatible open-source license. As new versions and language features have been added, Python has increasingly been used for standalone, large-scale projects.
Fetching a page quickly: use urllib's most basic functionality to download the Baidu homepage and save it to a local file.
>>> import urllib.request
>>>
>>> res = urllib.request.urlopen("https://www.baidu.com")
>>> data = res.read()              # read the body once; a second read() would return empty bytes
>>> print(data.decode("utf-8"))
>>> f = open("./test.html", "wb")  # save a local copy
>>> f.write(data)
>>> f.close()
Sending a POST request: the example above fetched Baidu with a GET request; the following uses urllib to send a POST request instead.
>>> import urllib.parse
>>> import urllib.request
>>>
>>> data = bytes(urllib.parse.urlencode({"hello": "lyshark"}), encoding="utf-8")
>>> print(data)
>>> response = urllib.request.urlopen('http://www.baidu.com/post', data=data)
>>> print(response.read())
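Baidu does not actually expose a /post echo endpoint, so the response above is not very informative. As a sketch, the same request can be pointed at httpbin.org/post (an assumption: a public echo service that reflects the request back as JSON) to see the submitted form data:

import urllib.parse
import urllib.request

# httpbin.org/post echoes the request back as JSON (external test service, assumed reachable)
data = bytes(urllib.parse.urlencode({"hello": "lyshark"}), encoding="utf-8")
response = urllib.request.urlopen("http://httpbin.org/post", data=data)
print(response.read().decode("utf-8"))   # the "form" field should contain {"hello": "lyshark"}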
Setting a TIMEOUT: give the request a timeout so the program does not wait indefinitely for a response.
import urllib.request

response = urllib.request.urlopen('http://www.baidu.com', timeout=1)
print(response.read())
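When the timeout expires, urlopen raises an exception rather than returning. A minimal sketch of catching it (the deliberately tiny 0.01-second timeout is only there to force the error):

import socket
import urllib.error
import urllib.request

try:
    response = urllib.request.urlopen('http://www.baidu.com', timeout=0.01)
    print(response.read())
except urllib.error.URLError as e:
    # a timeout while connecting is wrapped in URLError; inspect the underlying reason
    if isinstance(e.reason, socket.timeout):
        print("request timed out")
except socket.timeout:
    # a timeout while reading the body is raised directly
    print("request timed out while reading")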
Getting the response status: status, getheaders(), and getheader("server") return the status code and the response headers.
>>> import urllib.request
>>>
>>> res = urllib.request.urlopen("https://www.python.org")
>>> print(type(res))
<class 'http.client.HTTPResponse'>
>>>
>>> res.status
>>> res.getheaders()
>>> res.getheader("server")
Disguising the request: add custom headers so the request looks like it comes from an ordinary browser, which helps avoid being blocked.
from urllib import request, parse

url = 'http://www.baidu.com'
headers = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
    'Host': 'mkdirs.org'
}
params = {
    'name': 'LyShark'
}
data = bytes(parse.urlencode(params), encoding='utf8')
req = request.Request(url=url, data=data, headers=headers, method='POST')
response = request.urlopen(req)
print(response.read().decode('utf-8'))
Simple URL concatenation:
def Get_Url(target, start, ends):
    urls = []
    for i in range(start, ends):
        url = target + "/" + str(i)
        urls.append(url)
    return urls

if __name__ == "__main__":
    url = Get_Url("https://www.mzitu.com/214261", 1, 10)
    print(url)
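Plain string concatenation works for this simple numbered-page pattern. As a sketch, urllib.parse.urljoin from the standard library handles trailing slashes and relative paths more robustly (build_urls is a hypothetical helper name):

from urllib.parse import urljoin

def build_urls(base, start, ends):
    # urljoin normalizes the trailing slash, so the page number always lands after the base path
    return [urljoin(base + "/", str(i)) for i in range(start, ends)]

if __name__ == "__main__":
    print(build_urls("https://www.mzitu.com/214261", 1, 10))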
Using the requests library:
import re
import requests

head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}

if __name__ == "__main__":
    ret = requests.get(url="https://www.mzitu.com/214261", headers=head, timeout=1)
    all_pic_link = re.findall('<img src="(.*?)"', ret.text, re.S)
    print(all_pic_link)
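requests also makes it easy to confirm the request succeeded before parsing the body. A small sketch of the usual pattern, assuming the target page is UTF-8:

import requests

head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36'}

ret = requests.get(url="https://www.mzitu.com/214261", headers=head, timeout=5)
ret.raise_for_status()     # raise an HTTPError for 4xx/5xx responses instead of parsing an error page
ret.encoding = "utf-8"     # assumption: the page is UTF-8; otherwise fall back to ret.apparent_encoding
print(ret.status_code, len(ret.text))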
A simple image scraper:
import re
import urllib.request

def open_url(url):
    ret = urllib.request.Request(url)
    ret.add_header('user-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36')
    page = urllib.request.urlopen(ret)
    html = page.read().decode("utf-8")
    return html

def get_img(html):
    ret = re.findall(r'<img src="([^"]+\.jpg)"', html)
    for each in ret:
        filename = each.split("/")[-1]
        print("Full URL:", each)
        print("File name:", filename)
        urllib.request.urlretrieve(each, filename, None)

if __name__ == '__main__':
    url = open_url("https://www.mzitu.com/210402")
    get_img(url)
Scraping the daily CVE list:
import re
import requests
from bs4 import BeautifulSoup

head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}

def Get_CVE(url):
    new_cve = []
    ret = requests.get(url=url, headers=head, timeout=3)
    bs = BeautifulSoup(ret.text, 'html.parser')
    for i in bs.find_all('a'):
        href = i.get('href')
        new_cve.append(href)
    return new_cve

def Get_Number(links):
    new = []
    for i in links:
        temp = re.findall("[0-9]{1,}-.*", str(i))
        if temp:                                 # skip links that carry no CVE number
            new.append("CVE-{}".format(temp[0]))
    return new

if __name__ == "__main__":
    url = "https://cassandra.cerias.purdue.edu/CVE_changes/today.html"
    cve = Get_CVE(url)
    number = Get_Number(cve)
    for i in number:
        print("Today's CVE entry:", i)
Scraping proxy addresses from xicidaili: here we scrape with plain regular-expression matching, which is a fairly clumsy approach.
import re
import requests

head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}

ret = requests.get(url="https://www.xicidaili.com/nn/1", headers=head, timeout=3)
data = re.findall('<td>.*</td>', ret.text)

idx = 0
for i in range(0, 20):
    IP = data[idx].replace("<td>", "").replace("</td>", "")
    Port = data[idx + 1].replace("<td>", "").replace("</td>", "")
    Type = data[idx + 2].replace("<td>", "").replace("</td>", "")
    times = data[idx + 3].replace("<td>", "").replace("</td>", "")
    year = data[idx + 4].replace("<td>", "").replace("</td>", "")
    print("IP: {}  Port: {}  Type: {}  Lifetime: {}  Time: {}".format(IP, Port, Type, times, year))
    idx = idx + 5
BeautifulSoup selector techniques: bs4 must be installed along with its dependencies: pip install requests bs4 lxml
from bs4 import BeautifulSoup
import requests

head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}

ret = requests.get(url="https://lyshark.cnblogs.com", headers=head, timeout=3)
ret.encoding = "utf-8"   # set this if the page comes back garbled
bs = BeautifulSoup(ret.text, "lxml")

# Find every link tag inside the head node and take the href of the first match
print(bs.select('head link')[0]['href'])

# Find every a tag whose class is c_b_p_desc_readmore and extract its href
print(bs.find_all('a', class_='c_b_p_desc_readmore')[0]['href'])

# Find a tags with id blog_nav_admin and class menu, then extract the href
print(bs.find_all('a', id='blog_nav_admin', class_='menu')[0]['href'])
print(bs.find_all('a', id='blog_nav_admin', class_='menu')[0].attrs['href'])

# Take the link tag inside the div whose id is page_begin_html
print(bs.select('div[id="page_begin_html"] link')[0]['href'])
print(bs.select('ul[id="navList"] .menu')[0]['href'])

# Under body, match the div with id page_begin_html and take its first link element
print(bs.select('body > div[id="page_begin_html"] > link')[0])

# Extract the text inside a given tag
print(bs.select('title')[0].get_text())
print(bs.select('a[href="https://www.cnblogs.com/LyShark/archive/2019/12/04.html"]'))

# Walk down from the header div to the blogTitle div and then to the a tag inside it
print(bs.select('div[id="header"] div[id="blogTitle"] a[id="lnkBlogLogo"]'))
print(bs.select('body div[id="header"] div[class="blogStats"] span[id="stats_post_count"]'))
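The same selectors can be tried offline against a small HTML snippet, which makes it easier to experiment without hitting a live site. A minimal sketch (the markup and ids below are made up purely for illustration):

from bs4 import BeautifulSoup

# Invented document just to exercise select() and find_all()
html = """
<html>
  <head><link rel="stylesheet" href="/static/site.css"></head>
  <body>
    <div id="nav"><a id="home" class="menu" href="/index.html">Home</a></div>
    <div id="posts"><a class="readmore" href="/post/1.html">read more</a></div>
  </body>
</html>
"""

bs = BeautifulSoup(html, "html.parser")
print(bs.select('head link')[0]['href'])                  # /static/site.css
print(bs.find_all('a', class_='readmore')[0]['href'])     # /post/1.html
print(bs.select('div[id="nav"] > a.menu')[0].get_text())  # Home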
A simple use of stripped_strings: extract all the strings under the house-name tag.
from bs4 import BeautifulSoup
import requests
import html5lib

head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}

ret = requests.get(url="https://gz.centanet.com/ershoufang/", headers=head, timeout=3)
text = str(ret.content.decode('utf-8'))

bs = BeautifulSoup(text, "html5lib")
ret = bs.select('div[class="section"] div[class="house-item clearfix"] p[class="house-name"]')

for i in ret:
    # house = i.get_text()              # get_text() returns all of the text, formatting included
    house = list(i.stripped_strings)    # stripped_strings returns the strings as a list, whitespace removed
    print(house)
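The difference between get_text() and stripped_strings is easiest to see on a tiny snippet. A sketch (the HTML below is invented for the comparison):

from bs4 import BeautifulSoup

# Invented markup that mimics a house-name block with nested tags and extra whitespace
html = '<p class="house-name">  <span>Sunny Garden</span>\n  3 rooms  \n  <em>95 m²</em>  </p>'
tag = BeautifulSoup(html, "html.parser").p

print(tag.get_text())               # one string, with newlines and indentation preserved
print(list(tag.stripped_strings))   # ['Sunny Garden', '3 rooms', '95 m²']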
Scraping the China Weather Network (weather.com.cn):
from bs4 import BeautifulSoup
import requests
import html5lib

head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}

ret = requests.get(url="http://www.weather.com.cn/textFC/shandong.shtml", headers=head, timeout=3)
text = str(ret.content.decode('utf-8'))

bs = BeautifulSoup(text, "html5lib")
conMidtab = bs.find_all('div', class_='conMidtab')[1]   # locate the conMidtab forecast block
tr = conMidtab.find_all('tr')[2:]                        # tr tags inside conMidtab, starting from the third row

for i in tr:
    td = i.find_all('td')              # every td tag in the current row
    city_td = td[0]                    # the first td holds the city name
    # stripped_strings yields all descendant strings, with whitespace-only strings dropped
    city = list(city_td.stripped_strings)[0]
    temp = td[-5]                      # the td holding the temperature
    temperature = list(temp.stripped_strings)[0]
    print('City: {}  Temperature: {}'.format(city, temperature))
Scraping xicidaili proxies with bs4: using the library, the same job takes only a few lines.
import requests
from bs4 import BeautifulSoup

head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}

ret = requests.get(url="https://www.xicidaili.com/wt/", headers=head, timeout=3)
bs = BeautifulSoup(ret.text, "lxml")
ret = bs.select('table[id="ip_list"] tr[class="odd"]')

ip = []
for i in ret:
    house = list(i.stripped_strings)
    ip.append(house)

fp = open("save.log", 'a+', encoding='utf-8')
for i in range(0, 50):
    proxy_addr = "http://{}:{}".format(ip[i][0], ip[i][1])
    print(proxy_addr, file=fp)
    print("Proxy address (saved): {}".format(proxy_addr))
fp.close()
Using proxy IP addresses with requests
from time import sleep
import requests

head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}

proxy = {"http": "http://127.0.0.1:9999"}
# Without auth:  "http": "http://ip:port"
# With auth:     "http": "http://username:password@ip:port"

file = open("save.log", "r", encoding="utf-8")
for i in file.readlines():
    data = "".join(i.split('\n'))   # strip the trailing newline
    proxy.update(http=data)         # point the proxy dict at the current address
    try:
        ret = requests.get(url="https://www.cnblogs.com/LyShark/", headers=head, timeout=3, proxies=proxy)
        if ret.status_code == 200:
            print("Proxy {} works".format(proxy["http"]))
        else:
            print("Proxy {} failed with status {}".format(proxy["http"], ret.status_code))
    except requests.exceptions.RequestException:
        print("Proxy {} is offline".format(proxy["http"]))
    sleep(1)
Downloading a file through a proxy with requests
import requests head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'} proxy = { "http":"http://117.69.200.46:9999" } url = "https://nmap.org/dist/nmap-7.80-win32.zip" ret = requests.get(url=url, headers=head,stream=True,proxies=proxy) fp = open("nmap.zip","wb") for chunk in ret.iter_content(chunk_size=4096): if chunk: print("本次保存长度:{} ".format(len(chunk))) fp.write(chunk)