python爬虫2:按html标签提取信息和中文域名处理(BeautifulSoup用法初步)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# python3
#
# Fetch a blog post whose URL contains Chinese characters, save the raw
# HTML to output.html, then extract the title, date, word count, and body
# text with BeautifulSoup.
#
# Fixes vs. original: curly quotes (‘) replaced with ASCII quotes (were a
# SyntaxError), urllib.parse imported explicitly (the original relied on
# urllib.request importing it as a side effect), and open() given an
# explicit encoding so the script works regardless of the platform's
# default locale encoding.
import string
import urllib.parse
from urllib import request

from bs4 import BeautifulSoup

url = "https://ne0matrix.com/2020/01/08/伊朗,赢了"

# A URL containing Chinese characters cannot be passed to urlopen directly;
# percent-encode it first. safe= lists characters that must NOT be encoded
# (default is "/"); string.printable means only non-ASCII (the Chinese
# characters) get converted.
url_quote = urllib.parse.quote(url, safe=string.printable)
# The reverse operation is urllib.parse.unquote(url_quote).
print(url_quote)

page_read = request.urlopen(url_quote).read()
page_decode = page_read.decode('utf-8')

# Explicit encoding: the page was decoded as UTF-8, so write/read it back
# as UTF-8 instead of trusting the platform default (GBK on Chinese
# Windows would corrupt or crash).
with open('output.html', 'w', encoding='utf-8') as f:
    f.write(page_decode)

with open('output.html', 'r', encoding='utf-8') as f:
    alltext = f.read()

# Pass 'html.parser' explicitly; otherwise BeautifulSoup picks a default
# parser and emits a warning (harmless, but noisy).
bsobj = BeautifulSoup(alltext, 'html.parser')

print(bsobj.title)             # the <title>...</title> tag itself
print(bsobj.title.get_text())  # get_text() extracts the plain-text title

date = bsobj.find('p', {'class': 'mt-3'}).get_text()
print(date.strip())            # strip() removes surrounding whitespace

count = bsobj.find('span', {'class': 'post-count'})
print(count.get_text().strip())

text = bsobj.find('div', {'class': 'markdown-body'})
print(text.get_text())         # the article body
相关推荐
夜斗不是神 2020-11-17
染血白衣 2020-11-16
ARCXIANG 2020-11-02
ARCXIANG 2020-10-28
CycloneKid 2020-10-27
荒谬小孩 2020-10-26
逍遥友 2020-10-26
snakeson 2020-10-09
meylovezn 2020-08-28
囧芝麻 2020-08-17
数据挖掘工人 2020-08-15
cxcxrs 2020-07-28
dashoumeixi 2020-07-20
sunzhihaofuture 2020-07-19
我欲疾风前行 2020-07-06
sunzhihaofuture 2020-07-04
Dimples 2020-06-28