鉴于之前用 Python 写过爬虫,帮运营人员抓取过京东的商品品牌及分类,这次也用 Python 来做一个简单的单页面抓取版本,后期再补充完善。
# -*- coding: utf-8 -*-
import sys

import requests
from bs4 import BeautifulSoup  # class name is CamelCase; lowercase `beautifulsoup` raises ImportError
#------知乎答案收集----------
def get_content(url, data=None):
    """Fetch *url* and return the parsed <body> element.

    Args:
        url: Page URL to download.
        data: Unused; kept for backward compatibility with existing callers.
             (Original code had the non-name `none` as the default — fixed to None.)

    Returns:
        The ``<body>`` tag of the page as a BeautifulSoup element
        (None if the document has no body).
    """
    # Browser-like headers so Zhihu serves the normal HTML page.
    header = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, sdch',
        'accept-language': 'zh-cn,zh;q=0.8',
        'connection': 'keep-alive',
        'user-agent': 'mozilla/5.0 (windows nt 6.3; wow64) applewebkit/537.36 (khtml, like gecko) chrome/43.0.235',
    }
    req = requests.get(url, headers=header)
    # Force UTF-8 decoding; the page is known to be UTF-8 encoded.
    req.encoding = 'utf-8'
    bs = BeautifulSoup(req.text, 'html.parser')  # create BeautifulSoup object
    return bs.body  # body section only
def get_title(html_text):
    """Return the question title text.

    Args:
        html_text: BeautifulSoup element for the page body (from get_content).

    Returns:
        The title string. Under Python 3 this is already a ``str``, so the
        old ``.encode('utf-8')`` (which would now produce ``bytes`` and break
        string concatenation in the caller) has been dropped.
    """
    data = html_text.find('span', {'class': 'zm-editable-content'})
    return data.string
def get_question_content(html_text):
    """Print the question body text.

    Args:
        html_text: BeautifulSoup element for the page body (from get_content).

    Returns:
        None — output goes to stdout.
    """
    data = html_text.find('p', {'class': 'zm-editable-content'})
    if data.string is None:
        # Mixed markup: `.string` is None, so concatenate all nested strings.
        # ''.join avoids the quadratic += loop of the original.
        out = ''.join(data.strings)
        print('内容:\n' + out)
    else:
        print('内容:\n' + data.string)
def get_answer_agree(body):
    """Print the upvote ("agree") count found in an answer page body.

    Args:
        body: BeautifulSoup element for an answer page body (from get_content).

    Returns:
        None — output goes to stdout.
    """
    agree = body.find('span', {'class': 'count'})
    print('点赞数:' + agree.string + '\n')
def get_response(html_text):
    """Follow each answer's expand link and print its upvotes and full text.

    Args:
        html_text: BeautifulSoup element for the question page body.

    Returns:
        None — output goes to stdout.

    Notes:
        Links whose href starts with 'javascript' are collapsed-in-place
        answers with no separate page, so they are skipped.
    """
    responses = html_text.find_all('p', {'class': 'zh-summary summary clearfix'})
    for summary in responses:  # iterate elements directly instead of range(len(...))
        answer_href = summary.find('a', {'class': 'toggle-expand'})
        if answer_href['href'].startswith('javascript'):
            continue  # guard clause: no standalone answer page to fetch
        url = 'http://www.zhihu.com/' + answer_href['href']
        print(url)
        body = get_content(url)
        get_answer_agree(body)
        answer = body.find('p', {'class': 'zm-editable-content clearfix'})
        if answer.string is None:
            # Mixed markup: join all nested strings, each preceded by '\n'
            # (matches the original accumulation order exactly).
            out = ''.join('\n' + s for s in answer.strings)
            print(out)
        else:
            print(answer.string)
# Script entry point: fetch one Zhihu question page and print its title,
# body, and all expanded answers. Guarded so importing this module for its
# helper functions does not trigger network access.
if __name__ == '__main__':
    html_text = get_content('https://www.zhihu.com/question/43879769')
    title = get_title(html_text)
    print('标题:\n' + title + '\n')
    get_question_content(html_text)  # prints; return value was never used
    print('\n')
    get_response(html_text)  # prints; return value was never used
输出结果: