My First Python Web Crawler: Fetching the Content of a Web Page
1. Testing some of the functions in the match result prediction code.

2. Use the requests library to access the Sogou homepage 20 times.

import requests
from bs4 import BeautifulSoup

def getHTMLText(url):
    try:
        r = requests.get(url, timeout=30)
        # raise an exception if the status code is not 200
        r.raise_for_status()
        # force the encoding to utf-8, whatever it was originally
        r.encoding = 'utf-8'
        soup = BeautifulSoup(r.text, "html.parser")
        return r.text, r.status_code, len(r.text), r.encoding, len(soup.text)
    except:
        return ""

url = "https://www.sogou.com"
print(getHTMLText(url))
for i in range(20):
    print("第{}次访问".format(i + 1))
    print(getHTMLText(url))
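Since the loop above prints 20 full pages, it can be easier to summarize what came back instead. Below is a minimal sketch that reuses getHTMLText and url exactly as defined above; the tally variables (codes, total_len) are illustrative additions of mine, not part of the original exercise.

# Minimal sketch: tally status codes and total page length over 20 visits.
# Assumes getHTMLText(url) and url from the code above; variable names are illustrative.
total_len = 0
codes = []
for i in range(20):
    result = getHTMLText(url)
    if result:                      # getHTMLText returns "" on failure
        text, status, length, enc, soup_len = result
        codes.append(status)
        total_len += length
print("successful visits:", len(codes))
print("total text length:", total_len)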
3. Write a web page in HTML as required.
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <font size="10" color="purple">
        <title>我的第一个网页</title>
        <xuehao>我的学号:2019310143143</xuehao>
    </font>
</head>
<body>
    <p style="background-color:rgb(255,0,255)"> </p>
    <font size="5" color="blue">
        <h1>欢迎来到你的三连我的心</h1>
        <h2>好懒人就是我,我就是好懒人</h2>
        <h3>此人很懒,无精彩内容可示</h3>
        <h4>仅有鄙人陋见或许对你有用</h4>
        <a href="https://home.cnblogs.com/u/LSH1628340121/">QAQ打开看看嘛QAQ</a>
        <p id="first">我的观众老爷们,给个三连。</p>
    </font>
    <img src="F:\荷花.jpg" alt="荷花" width="900" height="800">
    <table border="1">
        <tr>
            <td>点赞, 收藏,关注</td>
        </tr>
        <tr>
            <td>投币1, 投币2</td>
        </tr>
    </table>
</body>
</html>
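Because the next exercise parses HTML with BeautifulSoup, the page above also makes a handy local test target. The sketch below assumes the page has been saved as index.html in the working directory; that file name is my assumption, not part of the original post.

# Minimal sketch: parse the page above with BeautifulSoup.
# Assumes it was saved locally as index.html (the file name is an assumption).
from bs4 import BeautifulSoup

with open("index.html", encoding="utf-8") as f:
    soup = BeautifulSoup(f.read(), "html.parser")

print(soup.title.string)                      # the page title
for h in soup.find_all(["h1", "h2", "h3", "h4"]):
    print(h.get_text())                       # the four headings
for a in soup.find_all("a"):
    print(a["href"], a.get_text())            # the link and its text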
4. Crawl the 2016 university ranking and save it as a CSV file.
import requests
from bs4 import BeautifulSoup
import csv

allUniv = []

def getHTMLText(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except:
        return ""

def fillUnivList(soup):
    # each ranked university sits in a <tr> whose <td> cells hold its fields
    data = soup.find_all('tr')
    for tr in data:
        ltd = tr.find_all('td')
        if len(ltd) == 0:
            continue
        singleUniv = []
        for td in ltd:
            singleUniv.append(td.string)
        allUniv.append(singleUniv)
    write_csv(allUniv)

def printUnivList(num):
    print("{:^4}{:^10}{:^5}{:^8}{:^10}".format("排名", "学校名称", "省市", "总分", "培养规模"))
    for i in range(num):
        u = allUniv[i]
        print("{:^4}{:^10}{:^5}{:^8}{:^10}".format(u[0], u[1], u[2], u[3], u[6]))
    return u

def write_csv(rows):
    name = ['排名', '学校名称', '省份', '总分', '生源质量(新生高考成绩得分)',
            '培养结果(毕业生就业率)', '社会声誉(社会捐赠收入·千元)',
            '科研规模(论文数量·篇)', '科研质量(论文质量·FWCI)',
            '顶尖成果(高被引论文·篇)', '顶尖人才(高被引学者·人)',
            '科技服务(企业科研经费·千元)', '成果转化(技术转让收入·千元)']
    # newline='' keeps csv.writer from inserting blank rows on Windows
    with open('C:/Users/86188/Desktop/新建文件夹/最好大学排名.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(name)
        for row in rows:
            writer.writerow(row)

def main(num):
    url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2016.html'
    html = getHTMLText(url)
    soup = BeautifulSoup(html, "html.parser")
    fillUnivList(soup)
    printUnivList(num)
    print("排名情况如上所示")

main(310)
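To verify the file produced by write_csv, the same csv module can read it back. This is a minimal sketch, assuming the same file path used above (adjust it to your own machine); it previews only the first four columns of the first five rows.

# Minimal sketch: read the CSV back and preview the first few rows.
# Assumes the path used in write_csv above; change it to match your machine.
import csv

with open('C:/Users/86188/Desktop/新建文件夹/最好大学排名.csv', newline='') as f:
    reader = csv.reader(f)
    header = next(reader)                # the column names written by write_csv
    print(header[:4])
    for i, row in enumerate(reader):
        print(row[:4])                   # 排名, 学校名称, 省份, 总分
        if i >= 4:                       # preview only the first five universities
            break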