python基础

一篇文章带你了解《python爬虫》

1. 什么是网络爬虫:

       1. 通俗理解:爬虫是一个模拟人类请求网站行为的程序。可以自动请求网页、并把数据抓取下来,然后使用一定的规则提取有价值的数据。

  2. 专业介绍:百度百科

2. 进入主题:

2.1 python urllib:

# urllib_01.py

#!-*- coding:utf-8 -*-
# Demonstrates the simplest urllib usage: issue a GET request directly
# with urlopen and inspect the response object.
# Fix: the original used mis-encoded smart quotes (‘…‘), which is a SyntaxError.
import urllib.request

url = 'http://www.baidu.com'
print('第一种方法 --> 直接请求 ')
response1 = urllib.request.urlopen(url)
# Status code: 200 means the request succeeded.
print("响应状态码 : ", response1.getcode())
# Read the whole body and report its length in bytes.
content = response1.read()
print("响应内容的长度 : ", len(content))
# The final URL (after any redirects) and the response headers.
requestUrl = response1.geturl()
print(requestUrl)
print(response1.info())

# urllib_02.py

# coding=utf-8
# Fetches a GBK-encoded JSONP feed from 163.com sports and saves it to disk.
from urllib import request
import os
import re, json

url = "http://sports.163.com/special/000587PN/newsdata_world_yj.js?callback=data_callback"
# The site serves GBK-encoded content, so decode accordingly.
response = request.urlopen(url).read().decode("gbk")
# Fix: ensure the output directory exists first — the original crashed with
# FileNotFoundError when "163/" was missing.
os.makedirs("163", exist_ok=True)
with open("163/1.txt", "w", encoding="utf8") as f:
    f.write(str(response))

# urllib_03.py

#coding=utf-8
# Use the request module to mimic a browser request to a lagou.com page.
# Fix: the original used mis-encoded smart quotes (‘…‘), which is a SyntaxError.
from urllib import request

url = r'http://www.lagou.com/zhaopin/Python/?labelWords=label'
# Spoof a desktop browser User-Agent and a Referer so the site does not
# reject the request as an obvious bot.
headers = {
        'User-Agent': r'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
        r'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
        'Referer': r'http://www.lagou.com/zhaopin/Python/?labelWords=label',
        'Connection': 'keep-alive'
        }
req = request.Request(url, headers=headers)
page = request.urlopen(req).read()
# Decode the downloaded content as UTF-8.
page = page.decode('utf-8')
print(page)

# urllib_04.py

#coding=utf-8
# Simulate an iPhone 6 requesting the Douban home page by setting a
# mobile-Safari User-Agent header.
# Fix: the original used mis-encoded smart quotes (‘…‘), which is a SyntaxError.
from urllib import request

req = request.Request('http://www.douban.com/')
req.add_header('User-Agent', 'Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25')
# urlopen as a context manager closes the response automatically.
with request.urlopen(req) as f:
    print('Status:', f.status, f.reason)
    for k, v in f.getheaders():
        print('%s: %s' % (k, v))
    print('Data:', f.read().decode('utf-8'))

# urllib_05.py

#coding=utf-8
# Simulated login to the mobile Sina Weibo site: POST the form fields that
# the real login page submits, with matching Origin/Referer/User-Agent headers.
# Fix: the original used mis-encoded smart quotes (‘…‘), which is a SyntaxError.
from urllib import request, parse

print('Login to weibo.cn...')
email = input('Email: ')
passwd = input('Password: ')
# Encode the login form exactly as the mobile site does.
login_data = parse.urlencode([
    ('username', email),
    ('password', passwd),
    ('entry', 'mweibo'),
    ('client_id', ''),
    ('savestate', '1'),
    ('ec', ''),
    ('pagerefer', 'https://passport.weibo.cn/signin/welcome?entry=mweibo&r=http%3A%2F%2Fm.weibo.cn%2F')
])
req = request.Request('https://passport.weibo.cn/sso/login')
req.add_header('Origin', 'https://passport.weibo.cn')
req.add_header('User-Agent', 'Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25')
req.add_header('Referer', 'https://passport.weibo.cn/signin/login?entry=mweibo&res=wel&wm=3349&r=http%3A%2F%2Fm.weibo.cn%2F')
# Passing data= makes this a POST request.
with request.urlopen(req, data=login_data.encode('utf-8')) as f:
    print('Status:', f.status, f.reason)
    for k, v in f.getheaders():
        print('%s: %s' % (k, v))
    print('Data:', f.read().decode('utf-8'))

# urllib_06.py

#coding=utf-8
# Show the three common inspection methods on a urlopen response:
# geturl(), info() and getcode().
# Fix: the original used mis-encoded smart quotes (‘…‘), which is a SyntaxError.
from urllib import request

if __name__ == "__main__":
    req = request.Request("http://fanyi.baidu.com/")
    response = request.urlopen(req)
    # Final URL after redirects.
    print("geturl打印信息:%s" % (response.geturl()))
    print('**********************************************')
    # Response headers.
    print("info打印信息:%s" % (response.info()))
    print('**********************************************')
    # HTTP status code.
    print("getcode打印信息:%s" % (response.getcode()))

# urllib_07.py

# -*- coding: UTF-8 -*-
# Demonstrate error handling: requesting a non-existent host raises
# urllib.error.URLError, whose .reason describes the failure.
# Fix: the original used mis-encoded smart quotes (‘…‘), which is a SyntaxError.
from urllib import request
from urllib import error

if __name__ == "__main__":
    # A URL that does not resolve.
    url = "http://www.abcdhaha2.com/"
    req = request.Request(url)
    try:
        response = request.urlopen(req)
        html = response.read().decode('utf-8')
        print(html)
    except error.URLError as e:
        # Print only the underlying cause (e.g. a DNS failure).
        print(e.reason)

2.2 

相关推荐