One Article to Get You Started with Python Web Crawlers
1. What is a web crawler:
1. In plain terms: a crawler is a program that simulates the way a human requests a website. It can request web pages automatically, capture their data, and then apply certain rules to extract the valuable parts (see the sketch after this list).
2. Formal definition: see Baidu Baike.
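To make the definition concrete, here is a minimal sketch of that request-then-extract loop; the URL and the regex rule are placeholders for illustration, not part of the original article.

# crawler_sketch.py -- a minimal sketch; example.com and the regex are placeholders
import re
from urllib import request

# Step 1: request the page, like a browser would
html = request.urlopen('http://www.example.com/').read().decode('utf-8')

# Step 2: apply a rule to extract the valuable part (here: the page title)
titles = re.findall(r'<title>(.*?)</title>', html)
print(titles)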
2. Getting into the topic:
2.1 Python urllib:
# urllib_01.py
# -*- coding: utf-8 -*-
import urllib.request

url = 'http://www.baidu.com'

print('Method 1 --> direct request')
response1 = urllib.request.urlopen(url)

# Get the status code; 200 means the request succeeded
print("Response status code:", response1.getcode())

# Get the length of the response body
content = response1.read()
print("Response content length:", len(content))

# Get the final URL and the response headers
requestUrl = response1.geturl()
print(requestUrl)
print(response1.info())
# urllib_02.py
# coding=utf-8
from urllib import request

url = "http://sports.163.com/special/000587PN/newsdata_world_yj.js?callback=data_callback"
# The endpoint answers in GBK, so decode accordingly
response = request.urlopen(url).read().decode("gbk")
# Save the raw response; assumes the 163/ directory already exists
with open("163/1.txt", "w", encoding="utf8") as f:
    f.write(response)
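The .js endpoint above returns JSONP: a JSON payload wrapped in a data_callback(...) call. Below is a sketch of one way to unwrap it with re and json, assuming the body inside the wrapper is strict JSON (real NetEase feeds may need extra cleanup).

# jsonp_parse.py -- a sketch; assumes the callback body is strict JSON
import json
import re

with open("163/1.txt", encoding="utf8") as f:
    raw = f.read()

# Strip the "data_callback( ... )" wrapper to get the bare payload
match = re.search(r"data_callback\((.*)\)\s*;?\s*$", raw, re.S)
if match:
    items = json.loads(match.group(1))  # raises ValueError if not strict JSON
    print("parsed", len(items), "items")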
# urllib_03.py
# coding=utf-8
# Use the request module to mimic a browser when requesting a Lagou listing page
from urllib import request

url = r'http://www.lagou.com/zhaopin/Python/?labelWords=label'
headers = {
    'User-Agent': r'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  r'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
    'Referer': r'http://www.lagou.com/zhaopin/Python/?labelWords=label',
    'Connection': 'keep-alive'
}
req = request.Request(url, headers=headers)
page = request.urlopen(req).read()
# Decode the downloaded content as utf-8
page = page.decode('utf-8')
print(page)
# urllib_04.py
# coding=utf-8
# Pretend to be an iPhone 6 when requesting the Douban homepage
from urllib import request

req = request.Request('http://www.douban.com/')
req.add_header('User-Agent', 'Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25')
with request.urlopen(req) as f:
    print('Status:', f.status, f.reason)
    for k, v in f.getheaders():
        print('%s: %s' % (k, v))
    print('Data:', f.read().decode('utf-8'))
# urllib_05.py
# coding=utf-8
# Simulated login to Sina Weibo (mobile endpoint)
from urllib import request, parse

print('Login to weibo.cn...')
email = input('Email: ')
passwd = input('Password: ')
login_data = parse.urlencode([
    ('username', email),
    ('password', passwd),
    ('entry', 'mweibo'),
    ('client_id', ''),
    ('savestate', '1'),
    ('ec', ''),
    ('pagerefer', 'https://passport.weibo.cn/signin/welcome?entry=mweibo&r=http%3A%2F%2Fm.weibo.cn%2F')
])
req = request.Request('https://passport.weibo.cn/sso/login')
req.add_header('Origin', 'https://passport.weibo.cn')
req.add_header('User-Agent', 'Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25')
req.add_header('Referer', 'https://passport.weibo.cn/signin/login?entry=mweibo&res=wel&wm=3349&r=http%3A%2F%2Fm.weibo.cn%2F')
# Passing data= turns this into a POST request
with request.urlopen(req, data=login_data.encode('utf-8')) as f:
    print('Status:', f.status, f.reason)
    for k, v in f.getheaders():
        print('%s: %s' % (k, v))
    print('Data:', f.read().decode('utf-8'))
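One caveat about the login above: it prints the response but throws away the session cookies, so a later urlopen call would not stay logged in. Here is a minimal sketch of cookie persistence with the standard library's http.cookiejar (an addition, not part of the original code):

# cookie_opener.py -- a sketch of keeping session cookies across requests
from http.cookiejar import CookieJar
from urllib import request

cookie_jar = CookieJar()
opener = request.build_opener(request.HTTPCookieProcessor(cookie_jar))

# Requests made through this opener send and store cookies automatically
with opener.open('https://passport.weibo.cn/signin/login') as f:
    f.read()
for cookie in cookie_jar:
    print(cookie.name, '=', cookie.value)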
# urllib_06.py
# coding=utf-8
from urllib import request

if __name__ == "__main__":
    req = request.Request("http://fanyi.baidu.com/")
    response = request.urlopen(req)
    print("geturl output: %s" % response.geturl())
    print('**********************************************')
    print("info output: %s" % response.info())
    print('**********************************************')
    print("getcode output: %s" % response.getcode())
# urllib_07.py
# -*- coding: UTF-8 -*-
from urllib import request
from urllib import error

if __name__ == "__main__":
    # A URL that does not exist
    url = "http://www.abcdhaha2.com/"
    req = request.Request(url)
    try:
        response = request.urlopen(req)
        html = response.read().decode('utf-8')
        print(html)
    except error.URLError as e:
        # e.reason explains why the request failed (e.g. name resolution error)
        print(e.reason)
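urllib_07.py catches URLError, which covers failures where no response arrives at all. When the server does respond but with an error status, urllib raises the more specific error.HTTPError; since HTTPError subclasses URLError, it must be caught first. A sketch (the 404 URL is a made-up example, not from the original series):

# urllib_08_sketch.py -- an added example; the URL below is made up
from urllib import request, error

url = "http://www.example.com/no-such-page"
try:
    response = request.urlopen(url)
    print(response.read().decode('utf-8'))
except error.HTTPError as e:
    # The server answered, but with an error status such as 404
    print("HTTP status:", e.code)
except error.URLError as e:
    # The request never completed (DNS failure, refused connection, ...)
    print("Reason:", e.reason)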
2.2