用 Python 抓取新浪微博时被屏蔽了。已经用了代理,有 10 个帐号、10 个代理,但爬得很慢。大家有什么好的办法?谢谢!
回复内容:
http://github.com/zhu327/rss —— 既然你也用 Python,就直接看代码吧。爬这里:http://service.weibo.com/widget/widget_blog.php?uid={uid} ,把 {uid} 替换成目标用户的 uid,无需登录,不会被挡。
爬手机端 http://weibo.cn 可以参考下面的代码(来自极客学院,侵删):
#-*-coding:utf8-*-
import os
import smtplib
import sys
import time
from email.mime.text import MIMEText

import requests
from lxml import etree
# Python 2 only: make implicit str <-> unicode conversions use UTF-8 so the
# Chinese text handled below can be concatenated and written to disk.
# reload(sys) is needed because sys.setdefaultencoding is removed from the
# sys namespace during interpreter start-up. Python 3 has no reload builtin
# and already defaults to UTF-8, so the hack is skipped there.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding('utf-8')
class mailhelper(object):
    """Send plain-text notification e-mails through an SMTP server.

    The connection settings below are placeholders ("xxxx") and must be
    replaced with real values before any mail can be delivered.
    """

    def __init__(self):
        self.mail_host = "smtp.xxxx.com"  # SMTP server host
        self.mail_user = "xxxx"  # login user name
        self.mail_pass = "xxxx"  # login password
        self.mail_postfix = "xxxx.com"  # domain suffix of the sender mailbox

    def send_mail(self, to_list, sub, content):
        """Send ``content`` as a UTF-8 plain-text mail to ``to_list``.

        Args:
            to_list: list of recipient addresses.
            sub: subject line.
            content: message body.

        Returns:
            True if the message was handed to the server, False if any
            step (connect, login, send) raised; the error text is printed.
        """
        # NOTE(review): the pasted source read `"xxoohelper"+""`; the
        # "<user@domain>" part looks like it was lost in extraction
        # (mail_postfix is otherwise never used). It is rebuilt here from
        # mail_user and mail_postfix - confirm this is the intended sender.
        me = "xxoohelper" + "<" + self.mail_user + "@" + self.mail_postfix + ">"
        msg = MIMEText(content, _subtype='plain', _charset='utf-8')
        msg['subject'] = sub
        msg['from'] = me
        msg['to'] = ";".join(to_list)
        try:
            server = smtplib.SMTP()
            try:
                server.connect(self.mail_host)
                server.login(self.mail_user, self.mail_pass)
                server.sendmail(me, to_list, msg.as_string())
            finally:
                # Release the socket even when connect/login/sendmail fails.
                server.close()
            return True
        except Exception as e:
            # Best-effort notification: report the failure and let the
            # caller decide what to do with the False result.
            print(str(e))
            return False
class xxoohelper(object):
    """Log in to weibo.cn and fetch the first post shown on one profile page.

    Posts that were already seen are remembered, one per line, in
    ``weibo.txt`` in the current working directory.
    """

    def __init__(self):
        self.url = 'http://weibo.cn/u/xxxxxxx'  # profile page to watch; fill in the real address
        self.url_login = 'https://login.weibo.cn/login/'
        # Replaced by getdata() with the login form's actual submit URL.
        self.new_url = self.url_login

    def getsource(self):
        """Download ``self.url`` and return the raw response body."""
        html = requests.get(self.url).content
        return html

    def getdata(self, html):
        """Build the login POST payload from the page returned by getsource().

        Reads the name of the password field, the hidden ``vk`` value and the
        form's ``action`` out of ``html``, and stores ``url_login + action``
        in ``self.new_url`` as the address to submit to.

        Args:
            html: page markup; presumably the login form that weibo.cn
                serves to anonymous visitors - TODO confirm.

        Returns:
            dict of form fields to POST. The account and password values are
            placeholders that must be filled in.

        Raises:
            IndexError: if the page does not contain the expected form fields.
        """
        selector = etree.HTML(html)
        # The password input's name is read from the page rather than
        # hard-coded, so it is used as a variable dict key below.
        password = selector.xpath('//input[@type="password"]/@name')[0]
        vk = selector.xpath('//input[@name="vk"]/@value')[0]
        action = selector.xpath('//form[@method="post"]/@action')[0]
        self.new_url = self.url_login + action
        data = {
            'mobile': 'xxxxx@xxx.com',
            password: 'xxxxxx',
            'remember': 'on',
            'backurl': 'http://weibo.cn/u/xxxxxx',  # change to the profile address
            'backtitle': u'微博',
            'trycount': '',
            'vk': vk,
            'submit': u'登录',
        }
        return data

    def getcontent(self, data):
        """Submit the login form and return the first post's text plus its time.

        Args:
            data: form payload produced by getdata().

        Returns:
            The post text (with every 'http://' removed) followed directly by
            the timestamp text.

        Raises:
            IndexError: if the response has fewer matching spans than expected.
        """
        try:
            to_text = unicode  # Python 2 text type
        except NameError:
            to_text = str  # Python 3: str is already text
        newhtml = requests.post(self.new_url, data=data).content
        new_selector = etree.HTML(newhtml)
        # NOTE(review): the attribute tests were blank in the pasted source
        # ('//span[@]', which is not valid XPath). class="ctt" (post text)
        # and class="ct" (timestamp) are believed to be the class names the
        # mobile site uses - confirm against the live markup.
        content = new_selector.xpath('//span[@class="ctt"]')
        # Index 2 is kept from the source; presumably the earlier matches
        # belong to the profile header - TODO confirm.
        newcontent = to_text(content[2].xpath('string(.)')).replace('http://', '')
        sendtime = new_selector.xpath('//span[@class="ct"]/text()')[0]
        sendtext = newcontent + sendtime
        return sendtext

    def tosave(self, text):
        """Append ``text`` as one line to weibo.txt."""
        with open('weibo.txt', 'a') as f:
            f.write(text + '\n')

    def tocheck(self, data):
        """Return True if ``data`` has not been recorded in weibo.txt yet.

        A missing weibo.txt counts as "nothing recorded".
        """
        if not os.path.exists('weibo.txt'):
            return True
        # 'with' closes the handle; the original left it open on every poll.
        with open('weibo.txt', 'r') as f:
            existweibo = f.readlines()
        return data + '\n' not in existweibo
if __name__ == '__main__':
    # Poll the watched profile forever; whenever a post that is not yet in
    # weibo.txt shows up, e-mail it and record it.
    mailto_list = ['xxxxx@qq.com']  # recipient address(es) for the notification
    helper = xxoohelper()
    while True:
        source = helper.getsource()
        data = helper.getdata(source)
        content = helper.getcontent(data)
        if helper.tocheck(content):
            if mailhelper().send_mail(mailto_list, u"女神更新啦", content):
                print(u"发送成功")
            else:
                print(u"发送失败")
            # Record the post even if the mail failed, so the same post is
            # not retried on every poll.
            helper.tosave(content)
            print(content)
        else:
            print(u'pass')
        time.sleep(30)  # wait 30 seconds between polls
据说爬手机版会有奇效。
我以前爬过,不知道现在还可行不。爬它的移动端页面,当时限制比网页端少。爬虫程序部署在 Google App Engine 的多个节点上跑。
新浪有开发者平台,有专门的 API 接口;用爬虫会被屏蔽。