python: mass-grabbing hot topics from a certain "Zhihu", cracking newbie accounts on a certain "CL" forum
My last post was a fun one: a Python script for credential-stuffing accounts on a certain domestic "CL" forum https://www.52pojie.cn/thread...
Many readers pointed out that accounts there are protected by Google verification, so even a cracked account is useless. In fact the newbie accounts can still be used; for the details of credential-stuffing those "CL" accounts, please head back to the previous post.
========== fancy divider ==========
This time, let's look at how to tackle the topics on a certain "Zhihu".
Zhihu is about as hot as community sites get right now, and the quality of its content is still relatively high among all the communities. Much of the time we need to crawl Zhihu's best topics, and not just for bragging rights: your own project may well turn out to need exactly this.
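The heavy lifting below is done by one AJAX endpoint, https://www.zhihu.com/node/TopicsPlazzaListV2, which the topics plaza uses to page through a category 20 topics at a time. As a warm-up, here is that call in isolation, as a minimal sketch: it assumes the endpoint still answers the way it did when the script was written, and topic_id 1234 is a made-up placeholder (real ids come from the topics page, as the full script shows).

import json
import urllib
import urllib2

url = 'https://www.zhihu.com/node/TopicsPlazzaListV2'
values = {
    'method': 'next',
    'params': json.dumps({'topic_id': 1234, 'offset': 0, 'hash_id': ''}),  # 1234 is a placeholder id
}
request = urllib2.Request(url, urllib.urlencode(values),
                          {'X-Requested-With': 'XMLHttpRequest'})
response = urllib2.urlopen(request, None, 5)
msg = json.loads(response.read().decode('utf-8'))['msg']
print len(msg)  # each entry is an HTML fragment describing one topic card

The full script wraps exactly this request in get_topis(), bumps offset by 20 until the response runs dry, and feeds every topic link it finds into a queue.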
Programmers talk in code, so let's dig into it together:
"""
@author:haoning
@create time:2015.8.5
"""
from __future__ import division # exact (true) division
from Queue import Queue
from __builtin__ import False # redundant in Python 2, kept from the original
import json
import os
import re
import platform
import uuid
import urllib
import urllib2
import sys
import time
import MySQLdb as mdb
from bs4 import BeautifulSoup
reload(sys)
sys.setdefaultencoding( "utf-8" )
headers = {
'User-Agent' : 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0',
'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',
'X-Requested-With':'XMLHttpRequest',
'Referer':'https://www.zhihu.com/topics',
'Cookie':'__utma=51854390.517069884.1416212035.1416212035.1416212035.1; q_c1=c02bf44d00d240798bfabcfc95baeb56|1455778173000|1416205243000; _za=b1c8ae35-f986-46a2-b24a-cb9359dc6b2a; aliyungf_tc=AQAAAJ1m71jL1woArKqF22VFnL/wRy6C; _xsrf=9d494558f9271340ab24598d85b2a3c8; cap_id="MDNiMjcwM2U0MTRhNDVmYjgxZWVhOWI0NTA2OGU5OTg=|1455864276|2a4ce8247ebd3c0df5393bb5661713ad9eec01dd"; n_c=1; _alicdn_sec=56c6ba4d556557d27a0f8c876f563d12a285f33a'
}
DB_HOST = '127.0.0.1'
DB_USER = 'root'
DB_PASS = 'root'
queue = Queue() # queue of topic nodes waiting to be crawled
nodeSet=set()
keywordSet=set()
stop=0
offset=-20
level=0
maxLevel=7
counter=0
base=""
conn = mdb.connect(DB_HOST, DB_USER, DB_PASS, 'zhihu', charset='utf8')
conn.autocommit(False)
curr = conn.cursor()
def get_html(url):
    try:
        req = urllib2.Request(url)
        response = urllib2.urlopen(req, None, 3)  # a proxy should be added here
        html = response.read()
        return html
    except:
        pass
    return None
def getTopics():
    url = 'https://www.zhihu.com/topics'
    print url
    try:
        req = urllib2.Request(url)
        response = urllib2.urlopen(req)  # a proxy should be added here
        html = response.read().decode('utf-8')
        print html
        soup = BeautifulSoup(html)
        lis = soup.find_all('li', {'class': 'zm-topic-cat-item'})
        for li in lis:
            data_id = li.get('data-id')
            name = li.text
            curr.execute('select id from classify_new where name=%s', (name,))
            y = curr.fetchone()
            if not y:
                curr.execute('INSERT INTO classify_new(data_id,name)VALUES(%s,%s)', (data_id, name))
                conn.commit()
    except Exception as e:
        print "get topic error", e
def get_extension(name):
    where = name.rfind('.')
    if where != -1:
        return name[where:]
    return None
def which_platform():
    sys_str = platform.system()
    return sys_str
def GetDateString():
    when = time.strftime('%Y-%m-%d', time.localtime(time.time()))
    foldername = str(when)
    return foldername
def makeDateFolder(par,classify):
    try:
        if os.path.isdir(par):
            newFolderName = par + '//' + GetDateString() + '//' + str(classify)
            if which_platform() == "Linux":
                newFolderName = par + '/' + GetDateString() + "/" + str(classify)
            if not os.path.isdir(newFolderName):
                os.makedirs(newFolderName)
            return newFolderName
        else:
            return None
    except Exception as e:
        print "kk", e
        return None
def download_img(url,classify):
    try:
        extention = get_extension(url)
        if extention is None:
            return None
        req = urllib2.Request(url)
        resp = urllib2.urlopen(req, None, 3)
        dataimg = resp.read()
        name = str(uuid.uuid1()).replace("-", "") + "_www.guandn.com" + extention
        top = "E://topic_pic"
        folder = makeDateFolder(top, classify)
        filename = None
        if folder is not None:
            filename = folder + "//" + name
        try:
            if "e82bab09c_m" in str(url):
                return True
            if not os.path.exists(filename):
                file_object = open(filename, 'w+b')
                file_object.write(dataimg)
                file_object.close()
                return '/room/default/' + GetDateString() + '/' + str(classify) + "/" + name
            else:
                print "file exist"
                return None
        except IOError as e1:
            print "e1=", e1
    except Exception as e:
        print "eee", e
    return None  # if the download failed, fall back to the original site's image link
def getChildren(node,name):
    global queue, nodeSet
    try:
        url = "https://www.zhihu.com/topic/" + str(node) + "/hot"
        html = get_html(url)
        if html is None:
            return
        soup = BeautifulSoup(html)
        p_ch = '父话题'
        node_name = soup.find('div', {'id': 'zh-topic-title'}).find('h1').text
        topic_cla = soup.find('div', {'class': 'child-topic'})
        if topic_cla is not None:
            try:
                p_ch = str(topic_cla.text)
                aList = soup.find_all('a', {'class': 'zm-item-tag'})  # grab all child-topic tags
                if u'子话题' in p_ch:
                    for a in aList:
                        token = a.get('data-token')
                        a = str(a).replace('\n', '').replace('\t', '').replace('\r', '')
                        start = str(a).find('>')
                        end = str(a).rfind('</a>')
                        new_node = str(str(a)[start + 1:end])
                        curr.execute('select id from rooms where name=%s', (new_node,))  # make sure the name is not already taken
                        y = curr.fetchone()
                        if not y:
                            print "y=", y, "new_node=", new_node, "token=", token
                            queue.put((token, new_node, node_name))
            except Exception as e:
                print "add queue error", e
    except Exception as e:
        print "get html error", e
def getContent(n,name,p,top_id):
    try:
        global counter
        curr.execute('select id from rooms where name=%s', (name,))  # make sure the name is not already taken
        y = curr.fetchone()
        print "exist?? ", y, "n=", n
        if not y:
            url = "https://www.zhihu.com/topic/" + str(n) + "/hot"
            html = get_html(url)
            if html is None:
                return
            soup = BeautifulSoup(html)
            title = soup.find('div', {'id': 'zh-topic-title'}).find('h1').text
            pic_path = soup.find('a', {'id': 'zh-avartar-edit-form'}).find('img').get('src')
            description = soup.find('div', {'class': 'zm-editable-content'})
            if description is not None:
                description = description.text
            if u"未归类" in title or u"根话题" in title:  # still allow these into the DB, to avoid an infinite loop
                description = None
            tag_path = download_img(pic_path, top_id)
            print "tag_path=", tag_path
            if (tag_path is not None) or tag_path == True:
                if tag_path == True:
                    tag_path = None
                father_id = 2  # default parent: the misc/chat topic
                curr.execute('select id from rooms where name=%s', (p,))
                results = curr.fetchall()
                for r in results:
                    father_id = r[0]
                name = title
                curr.execute('select id from rooms where name=%s', (name,))  # make sure the name is not already taken
                y = curr.fetchone()
                print "store see..", y
                if not y:
                    friends_num = 0
                    temp = time.time()
                    x = time.localtime(float(temp))
                    create_time = time.strftime("%Y-%m-%d %H:%M:%S", x)  # get time now
                    creater_id = None
                    room_avatar = tag_path
                    is_pass = 1
                    has_index = 0
                    reason_id = None
                    #print father_id,name,friends_num,create_time,creater_id,room_avatar,is_pass,has_index,reason_id
                    # content that qualifies for the database
                    counter = counter + 1
                    curr.execute("INSERT INTO rooms(father_id,name,friends_num,description,create_time,creater_id,room_avatar,is_pass,has_index,reason_id)VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)", (father_id, name, friends_num, description, create_time, creater_id, room_avatar, is_pass, has_index, reason_id))
                    conn.commit()  # commit immediately, otherwise later lookups cannot find the parent node
                    if counter % 200 == 0:
                        print "current node", name, "num", counter
    except Exception as e:
        print "get content error", e
def work():
    global queue
    curr.execute('select id,node,parent,name from classify where status=1')
    results = curr.fetchall()
    for r in results:
        top_id = r[0]
        node = r[1]
        parent = r[2]
        name = r[3]
        try:
            queue.put((node, name, parent))  # seed the queue first
            while queue.qsize() > 0:
                n, name, p = queue.get()  # pop the head node
                getContent(n, name, p, top_id)
                getChildren(n, name)  # enqueue the popped node's children
                conn.commit()
        except Exception as e:
            print "what's wrong", e
def new_work():
    global queue
    curr.execute('select id,data_id,name from classify_new_copy where status=1')
    results = curr.fetchall()
    for r in results:
        top_id = r[0]
        data_id = r[1]
        name = r[2]
        try:
            get_topis(data_id, name, top_id)
        except:
            pass
def get_topis(data_id,name,top_id):
    global queue
    url = 'https://www.zhihu.com/node/TopicsPlazzaListV2'
    isGet = True
    offset = -20
    data_id = str(data_id)
    while isGet:
        offset = offset + 20
        values = {'method': 'next', 'params': '{"topic_id":' + data_id + ',"offset":' + str(offset) + ',"hash_id":""}'}
        try:
            msg = None
            try:
                data = urllib.urlencode(values)
                request = urllib2.Request(url, data, headers)
                response = urllib2.urlopen(request, None, 5)
                html = response.read().decode('utf-8')
                json_str = json.loads(html)
                ms = json_str['msg']
                if len(ms) < 5:  # too few entries means the category is exhausted
                    break
                msg = ms[0]
            except Exception as e:
                print "eeeee", e
            #print msg
            if msg is not None:
                soup = BeautifulSoup(str(msg))
                blks = soup.find_all('div', {'class': 'blk'})
                for blk in blks:
                    page = blk.find('a').get('href')
                    if page is not None:
                        node = page.replace("/topic/", "")  # turn the link into a new seed for the DB
                        parent = name
                        ne = blk.find('strong').text
                        try:
                            queue.put((node, ne, parent))  # seed the queue first
                            while queue.qsize() > 0:
                                n, name, p = queue.get()  # pop the head node
                                size = queue.qsize()
                                if size > 0:
                                    print size
                                getContent(n, name, p, top_id)
                                getChildren(n, name)  # enqueue the popped node's children
                                conn.commit()
                        except Exception as e:
                            print "what's wrong", e
        except urllib2.URLError as e:
            print "error is", e
if __name__ == '__main__':
    i = 0
    while i < 400:
        new_work()
        i = i + 1
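One thing the script leaves out is the MySQL side. The original DDL was never posted, so the following is only a sketch reverse-engineered from the SELECT/INSERT statements above; every column type is my guess, not the author's schema.

import MySQLdb as mdb

conn = mdb.connect('127.0.0.1', 'root', 'root', 'zhihu', charset='utf8')
curr = conn.cursor()
# Topic categories harvested by getTopics().
curr.execute("""CREATE TABLE IF NOT EXISTS classify_new (
    id INT AUTO_INCREMENT PRIMARY KEY,
    data_id VARCHAR(32),
    name VARCHAR(255))""")
# Working copy that new_work() reads; status=1 marks rows still to crawl.
curr.execute("""CREATE TABLE IF NOT EXISTS classify_new_copy (
    id INT AUTO_INCREMENT PRIMARY KEY,
    data_id VARCHAR(32),
    name VARCHAR(255),
    status TINYINT DEFAULT 1)""")
# Crawled topics; father_id points at the parent topic's row id.
curr.execute("""CREATE TABLE IF NOT EXISTS rooms (
    id INT AUTO_INCREMENT PRIMARY KEY,
    father_id INT,
    name VARCHAR(255),
    friends_num INT,
    description TEXT,
    create_time DATETIME,
    creater_id INT,
    room_avatar VARCHAR(255),
    is_pass TINYINT,
    has_index TINYINT,
    reason_id INT)""")
conn.commit()
# The legacy work() path additionally expects a `classify` table with
# (id, node, parent, name, status); it is unused when running new_work().

With the tables in place, the flow is presumably: run getTopics() once to fill classify_new, copy the categories you want into classify_new_copy with status=1, then launch the script and let new_work() loop.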
The code is quite simple, of course; anyone with a little Python under their belt can handle it, and the comments spell everything out. Please discuss and study it calmly. My humble offering.