# python selenium 采集百家号文章 (Python + Selenium: scrape Baijiahao articles)

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import xlwt
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
# from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import time
from urllib import request
import gc
from lxml import etree
from bs4 import BeautifulSoup
import pymysql
#C:\Users\Administrator\AppData\Local\Programs\Python\Python36\python.exe
class Bjh():
    """Scrape the article list of a Baijiahao author page with Selenium.

    The browser is started in Chrome mobile emulation ('iPhone 6/7/8 Plus').
    ``run`` scrolls the page until the loader reports state "2", then
    ``get_article`` downloads every listed article and stores the new ones
    through ``DataBase``.

    NOTE(review): the Selenium 3 style calls (``find_element_by_class_name``,
    ``executable_path=``, ``chrome_options=``) are kept as in the original;
    they need replacing if the project moves to Selenium 4 -- confirm the
    installed version.
    """

    def __init__(self,
                 driver_path=r"D:\php\chromedriver\chromedriver.exe",
                 user_data_dir=r"C:\wamp\www\python\works\log"):
        """Create the Excel workbook and start Chrome.

        Args:
            driver_path: location of the chromedriver executable. The default
                is the value that used to be hard-coded; it is a raw string so
                the backslashes are never read as escape sequences.
            user_data_dir: Chrome profile directory passed as
                ``user-data-dir``.
        """
        self.wb = xlwt.Workbook()
        chrome_options = webdriver.ChromeOptions()
        mobileEmulation = {'deviceName': 'iPhone 6/7/8 Plus'}
        chrome_options.add_experimental_option('mobileEmulation', mobileEmulation)
        chrome_options.add_argument("user-data-dir=" + user_data_dir)
        self.driver = webdriver.Chrome(executable_path=driver_path,
                                       chrome_options=chrome_options)

    def open(self, url):
        """Navigate the browser to ``url``."""
        self.driver.get(url=url)

    def hua(self):
        """Scroll to the bottom of the page once, then wait two seconds."""
        self.driver.maximize_window()
        print("滑动加载中")
        self.driver.execute_script("window.scrollTo(1000,document.body.scrollHeight)")
        time.sleep(2)

    def check_state(self):
        """Return the one-character state read from the ``s-loader`` element.

        The character is the third from the end of the second line of the
        element's innerHTML. ``run`` stops scrolling when it equals "2"
        (presumably "no more content" -- confirm against the page markup).

        Raises:
            IndexError: if the innerHTML has no second line.
        """
        data = self.driver.find_element_by_class_name("s-loader").get_attribute('innerHTML')
        return str(data).split("\n")[1][-3:-2]

    @staticmethod
    def _normalize_publish_time(date_text, time_text, default_year=None):
        """Build a ``YYYY-MM-DD HH:MM`` string from the page's date/time text.

        ``date_text`` may carry the '发布时间：' prefix. A date with three
        parts and a two-digit year ('20-12-05') gets the century prefix '20';
        a three-part date whose year already has four digits is kept; a
        month-day date ('03-20') gets ``default_year`` (the current local
        year when None).

        The number of '-' separated parts decides the form. The former test
        ``'20' in date`` misread dates such as '03-20' as carrying a year.
        """
        date_part = date_text.replace('发布时间：', '').strip()
        pieces = date_part.split('-')
        if len(pieces) >= 3:
            if len(pieces[0]) == 4:
                full_date = date_part
            else:
                full_date = '20' + date_part
        else:
            if default_year is None:
                default_year = time.localtime().tm_year
            full_date = '%d-%s' % (default_year, date_part)
        return full_date + ' ' + time_text.strip()

    def get_article(self):
        """Store every listed article that is not in the database yet.

        For each ``sfi-article`` element the title is checked first, so known
        articles are not downloaded again. New articles are fetched, their
        publish time is converted to a Unix timestamp, and the third ``<img>``
        on the page is used as cover.
        """
        cards = self.driver.find_elements_by_class_name('sfi-article')
        if not cards:
            return
        # One connection for the whole batch instead of one per article.
        db = DataBase()
        for card in cards:
            title = card.get_attribute('title')
            url = card.get_attribute('url')
            if not title or not url:
                continue
            if db.checkTitle(title):
                continue
            # The context manager closes the HTTP response.
            with request.urlopen(url) as response:
                raw = response.read()
            page = BeautifulSoup(raw.decode(), 'html.parser')
            date_spans = page.find_all("span", class_="date")
            print(date_spans)
            time_spans = page.find_all("span", class_="time")
            if not date_spans or not time_spans:
                print("skip (no publish time found): %s" % url)
                continue
            stamp = self._normalize_publish_time(date_spans[0].get_text(),
                                                 time_spans[0].get_text())
            add_time = int(time.mktime(time.strptime(stamp, "%Y-%m-%d %H:%M")))
            # Cover image: the third <img> of the page, as in the original
            # (presumably the first two are page chrome -- verify).
            images = page.find_all('img')
            cover = images[2].get('src') if len(images) > 2 else ''
            body = page.find_all(id='article')
            db.insertArticle(str(title), str(cover), add_time, body)

    def write(self, title, type, data):
        """Write ``data`` rows into a new sheet named ``type``.

        Columns 0-2 are always written; column 3 only when ``type`` is
        "article". Rows equal to ``[]`` are skipped and take no sheet row.
        ``title`` is accepted for interface compatibility and not used.
        """
        ws = self.wb.add_sheet(type)
        index = 0
        for row in data:
            if row == []:
                continue
            for col in range(3):
                ws.write(index, col, row[col])
            if type == "article":
                ws.write(index, 3, row[3])
            index += 1

    def run(self, url):
        """Open ``url``, scroll until the loader state is "2", then scrape.

        The browser is always shut down, even when scraping raises.
        """
        try:
            self.open(url)
            rounds = 0
            while True:
                self.hua()
                rounds += 1
                print(rounds)
                if self.check_state() == "2":
                    break
            self.get_article()
            self.driver.refresh()
            self.driver.implicitly_wait(1)
            self.driver.refresh()
            self.driver.implicitly_wait(1)
            print(rounds)
        finally:
            # quit() closes every window and ends the driver session.
            self.driver.quit()

class DataBase():
    """Small PyMySQL wrapper used to store scraped articles.

    Holds one connection (``self.con``) and one cursor (``self.cur``).
    Statements use driver-side parameters, and writes are committed
    explicitly so they persist whatever the PyMySQL version does on
    ``with connection:``.
    """

    # NOTE(review): the default credentials are kept only for backward
    # compatibility; they should come from configuration, not source code.
    def __init__(self, host='127.0.0.1', port=8888, user='news_5',
                 passwd='67675RD1CX', db='newsdb', charset='utf8'):
        """Connect to MySQL using the given parameters."""
        self.con = pymysql.connect(host=host, port=port, user=user,
                                   passwd=passwd, db=db, charset=charset)
        self.cur = self.con.cursor()

    def close(self):
        """Close the cursor and the connection."""
        self.cur.close()
        self.con.close()

    # Method 1:
    def _reCon(self):
        """Ping the server; on OperationalError force a reconnect and retry."""
        while True:
            try:
                self.con.ping()
                break
            except pymysql.OperationalError:
                self.con.ping(True)

    # Method 2:
    def _reConn(self, num=28800, stime=3):
        """Make sure the connection is alive, reconnecting if needed.

        Args:
            num: number of retries after the first failed attempt.
            stime: seconds to sleep between attempts.

        Raises:
            pymysql.Error: the last error, once all attempts have failed.
        """
        failures = 0
        while True:
            try:
                # ping(reconnect=True) re-opens a dropped connection itself.
                self.con.ping(reconnect=True)
                return
            except pymysql.Error:
                failures += 1
                if failures > num:
                    raise
                time.sleep(stime)

    @staticmethod
    def _next_article_id(row):
        """Return the id after the ``max(article_id)`` row (1 if table empty)."""
        current = row[0] if row else None
        if current is None:
            return 1
        return int(current) + 1

    def insertArticle(self, title, cover, add_time, content):
        """Insert one article row and its content row in a single transaction.

        ``content`` is stored as ``str(content)``. The new ``article_id`` is
        ``max(article_id) + 1``.
        NOTE(review): that is not safe with concurrent writers -- consider an
        AUTO_INCREMENT column.
        """
        self._reConn()
        try:
            self.cur.execute("select max(article_id) from `article` ")
            article_id = self._next_article_id(self.cur.fetchone())
            # Parameters are escaped by the driver; titles and content come
            # from scraped pages and may contain quotes.
            self.cur.execute(
                "insert into `article` (`title`,`channel_id`,`class_id`,`cover`,"
                "`author`,`editor_uid`,`editor_name`,`check_time`,`status`,"
                "`author_id`,`typedef`,`article_id`) "
                "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                (str(title), 9, 66, str(cover), '健康', 74, '医生',
                 add_time, 1, 7865, 1, article_id))
            self.cur.execute(
                "insert into `content` (`article_id`,`content`) VALUES (%s,%s)",
                (article_id, str(content)))
            self.con.commit()
        except Exception:
            # Keep article/content consistent: undo a half-written pair.
            self.con.rollback()
            raise

    def checkTitle(self, title):
        """Return the first ``article`` row whose title equals ``title``, or None."""
        self._reConn()
        self.cur.execute("select * from `article` where `title`=%s", (title,))
        row = self.cur.fetchone()
        # End the read transaction so later checks see freshly committed rows.
        self.con.commit()
        return row
if __name__ == '__main__':
    # Script entry point: scrape the article list of one Baijiahao author page.
    bjh = Bjh()
    bjh.run("https://author.baidu.com/home/1554288727768751")

# (web page residue: "发表评论 取消回复" -- "post a comment / cancel reply")