本文给大家分享的是一个使用 Python、以搜狗为入口爬取微信文章的小程序,非常简单实用,有需要的小伙伴可以参考一下。
本人想搞个采集微信文章的网站,无奈实在无法从微信本身找到入口链接。在网上翻看了大量的资料后,发现大家的做法总体来说大同小异,都是以搜狗为入口。下文是笔者整理的一份 Python 爬取微信文章的代码,有兴趣的欢迎阅读。
#!/usr/bin/env python
# coding: utf-8
# Module preamble: interpreter/encoding declarations, author tag and imports.
author = 'haoning'

import datetime
import json
import os
import re
import sys
import time
# The module name is case-sensitive: ``xml.etree.elementtree`` does not exist.
import xml.etree.ElementTree as et

import requests

# Python 2 only: ``reload`` is a builtin there, and reloading ``sys`` restores
# ``setdefaultencoding`` so implicit str/unicode conversions use UTF-8.
# Python 3 has neither name, so the hack is skipped there.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding("utf-8")
# Alternative identifier kept from the original listing:
#openid = 'oiwsftyel13zmva1qltq3pfejlwu'
# Identifier substituted into the Sogou request URLs as the ``openid`` value.
# NOTE(review): the value ends in "&ext" with nothing after it -- it looks
# truncated in the published listing; confirm the full value.
openid = 'oiwsftw_-w2dahwrz1ogwzl-wf9m&ext'
# Accumulates the 'items' entries of every page passed to add_xml().
xml_list = []


def current_milli_time():
    """Return the current Unix time in whole milliseconds.

    Returns:
        ``time.time()`` scaled to milliseconds and rounded to an integer.
    """
    return int(round(time.time() * 1000))
def get_json(pageindex):
    """Fetch one page of the article list from Sogou and decode it.

    Builds the ``gzhjs`` JSONP request for the module-level ``openid``, prints
    the URL and the raw response body, strips the ``sogou.weixin.gzhcb(...)``
    callback wrapper and parses the remainder as JSON.

    Args:
        pageindex: Page number substituted into the request URL.

    Returns:
        The object produced by ``json.loads`` for the extracted payload.

    Raises:
        ValueError: If the callback prefix or a ``)`` is missing from the
            response, or the extracted text is not valid JSON.

    Errors raised by ``requests.get`` are not caught and propagate unchanged.
    """
    # ``openid`` is only read here, so no ``global`` declaration is needed.
    # NOTE(review): both format strings were truncated after "open" in the
    # published listing. The placeholders below are reconstructed from the
    # ``.format`` arguments; the query parameter names ``openid``, ``page``
    # and ``t`` are an assumption -- confirm against the Sogou endpoint.
    the_headers = {
        'user-agent': 'mozilla/5.0 (macintosh; intel mac os x 10_9_5) applewebkit/537.36 (khtml, like gecko) chrome/39.0.2171.95 safari/537.36',
        'referer': 'http://weixin.sogou.com/gzh?openid={0}'.format(openid),
        'host': 'weixin.sogou.com'
    }
    url = (
        'http://weixin.sogou.com/gzhjs?cb=sogou.weixin.gzhcb'
        '&openid={0}&page={1}&t={2}'
    ).format(openid, pageindex, current_milli_time())
    print(url)

    response = requests.get(url, headers=the_headers)
    # TODO: check that the body really matches the expected JSONP shape.
    response_text = response.text
    print(response_text)

    # The payload starts right after the JSONP callback's opening parenthesis.
    callback_prefix = 'sogou.weixin.gzhcb('
    json_start = response_text.index(callback_prefix) + len(callback_prefix)
    # NOTE(review): the slice ends two characters before the FIRST ')' in the
    # body, exactly as in the original listing. Presumably two characters sit
    # between the JSON and that parenthesis -- confirm against a live response.
    json_end = response_text.index(')') - 2
    json_str = response_text[json_start:json_end]

    # Convert the extracted text into a Python object.
    json_obj = json.loads(json_str)
    return json_obj
def add_xml(jsonobj):
    """Append the ``'items'`` entries of one decoded page to ``xml_list``.

    Args:
        jsonobj: Object subscriptable by ``'items'``; the value must be
            iterable (presumably the dict returned by ``get_json`` -- verify
            against the caller).

    Raises:
        KeyError: If ``jsonobj`` is a dict without an ``'items'`` key.
    """
    # ``list.extend`` mutates the module-level list in place, so no
    # ``global`` declaration is required.
    xml_list.extend(jsonobj['items'])
# www.oksousou.com
# ------------ main ----------------
# Script body: load page 1, read the totals it reports, then load the
# remaining pages into xml_list.
print('play it 🙂 ')

# The first page is fetched up front because the page and item totals are
# read from it.
default_json_obj = get_json(1)
total_pages = 0
total_items = 0
if default_json_obj:
    # Keep the first page's items before requesting the remaining pages.
    add_xml(default_json_obj)
    # NOTE(review): other identifiers in the published listing appear
    # lowercased; confirm the key spelling against a live response.
    total_pages = default_json_obj['totalpages']
    total_items = default_json_obj['totalitems']
    print(total_pages)

# Page 1 is already loaded; the range is empty when total_pages < 2, so no
# separate guard is needed.
for pageindex in range(2, total_pages + 1):
    add_xml(get_json(pageindex))
    print('load page ' + str(pageindex))

# NOTE(review): indentation was lost in the published listing, so it is
# unclear whether this count was printed per page or once at the end; it is
# printed once here as a final summary -- confirm.
print(len(xml_list))
以上就是 Python 爬取微信文章方法的详细内容,更多请关注第一PHP社区的其它相关文章!