# -*- coding: utf-8 -*-
"""Scrape Pexels search-result pages and archive the images they list.

For every image link found on a result page the script downloads the image
into ``ORIG_DIRECTORY``, writes a thumbnail into ``THUMB_DIRECTORY`` and
inserts one row describing the image into the MySQL table ``lcr_cs``.
"""
import os
import random
import re
import socket
import threading
import time
import urllib

import gevent
import pymysql
import requests
from bs4 import BeautifulSoup
from gevent import Greenlet
from PIL import Image

# NOTE(review): credentials are hard-coded and the connection is opened at
# import time; consider moving both behind configuration.
connect = pymysql.Connect(
    host='localhost',
    port=3306,
    user='python',
    passwd='8eeded',
    db='python',
    charset='utf8'
)
cursor = connect.cursor()
# Serialises use of the single shared cursor/connection above.
mutex = threading.Lock()

user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko",
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36 QIHU 360SE",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E)",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393",
    "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/60.0.1163.0 Safari/536.3",
]

# Search categories; one is picked at random for every page request.
CATEGORIES = [
    'face',
    'beach',
    'family',
    'medical',
    'young',
    'man',
    'art',
    'healthy',
    'kids',
]

SEARCH_URL = "https://www.pexels.com/search/"
SEARCH_QUERY_SUFFIX = "&format=js&seed=2018-06-27%2002:36:18%20+0000"
# Captures (alt text, big image URL) pairs from the escaped HTML in the response.
LINK_PATTERN = re.compile(r'alt=\\"(.*?)\\" data-big-src=\\"(.*?)"')

MAX_REQUESTS = 3564      # upper bound on page requests per worker
MAX_404_RETRIES = 5      # extra attempts (with a new category) after a 404
REQUEST_TIMEOUT = 3.0    # seconds; also installed as the socket default

SIZE = (400, 300)        # bounding box for generated thumbnails
ORIG_DIRECTORY = os.path.join(os.path.abspath('.'), 'orig')
THUMB_DIRECTORY = os.path.join(os.path.abspath('.'), 'thumb')


def cbk(a, b, c):
    """Print ``a * b`` as a percentage of ``c``, capped at 100.

    Presumably a download-progress hook (block count, block size, total
    size) -- it is not called anywhere in this file; confirm before relying
    on that reading. Nothing is printed when ``c`` is not positive, which
    avoids a division by zero.
    """
    if c <= 0:
        return
    per = min(100.0 * a * b / c, 100)
    print('%.2f%%' % per)


def _fetch_search_page(index_number):
    """Request one search-result page for a random category.

    A 404 is retried with a freshly chosen category, at most
    ``MAX_404_RETRIES`` extra times. Any other status code is returned
    to the caller as-is.

    Returns:
        ``(response, category)`` for the first non-404 response, or ``None``
        when every attempt returned 404.
    """
    for _ in range(MAX_404_RETRIES + 1):
        category = random.choice(CATEGORIES)
        url = (SEARCH_URL + category + "?page=" + str(index_number)
               + SEARCH_QUERY_SUFFIX)
        headers = {'User-Agent': random.choice(user_agent_list)}
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        if response.status_code != 404:
            return response, category
        print(url)
    return None


def _parse_image_link(big_src):
    """Split a captured ``data-big-src`` value into URL, height and width.

    The height and width are read from the third and fourth ``&``-separated
    query fields, e.g.
    ``.../pexels-photo-713829.jpeg?auto=compress&cs=tinysrgb&h=750&w=1260``.

    Returns:
        ``(img_url, img_height, img_width)`` as strings, or ``None`` when the
        value does not have that shape.
    """
    parts = big_src.split('?')
    if len(parts) < 2:
        return None
    fields = parts[1].split('&')
    try:
        img_height = fields[2].split('=')[1]
        # The capture can end with the backslash that escaped the closing quote.
        img_width = fields[3].split('=')[1].replace('\\', '')
    except IndexError:
        return None
    return parts[0], img_height, img_width


def photo_download(photo_thread, index_number, photo_number, number):
    """Walk search-result pages, saving every listed image and its DB row.

    Args:
        photo_thread: Worker identifier; not used by the function body.
        index_number: First result page to request; advanced by one after
            every iteration.
        photo_number: Starting value of the processed-image counter.
        number: Starting value of the request counter; the loop stops once it
            reaches ``MAX_REQUESTS``.

    Returns:
        The final value of the processed-image counter.
    """
    # Install a default socket timeout for the whole crawl.
    socket.setdefaulttimeout(REQUEST_TIMEOUT)
    while number < MAX_REQUESTS:
        number += 1
        try:
            page = _fetch_search_page(index_number)
            if page is not None:
                response, cate = page
                for title, big_src in LINK_PATTERN.findall(response.text):
                    parsed = _parse_image_link(big_src)
                    if parsed is None:
                        continue
                    img_url, img_height, img_width = parsed
                    # Show the attributes that are about to be stored.
                    print(img_url, img_height, img_width, title)
                    photo_number += 1
                    stamp = time.strftime('%Y%m%d', time.localtime(time.time()))
                    img_save = stamp + img_url[-10:]
                    img_thumb = 'thumb_' + stamp + img_url[-10:]
                    save(img_url)  # store the image and thumbnail locally
                    save_db(img_url, img_height, img_width, img_save, title,
                            img_thumb, cate)  # record the image in the database
        except Exception as e:
            # Best effort: a failure abandons the rest of this page only.
            print('page %s failed: %r' % (index_number, e))
        # Advance to the next page whether or not this one succeeded.
        index_number += 1
    return photo_number


def save_db(img_url, img_height, img_width, img_save, title, img_thumb, cate):
    """Insert one image record into the ``lcr_cs`` table.

    The values are passed as query parameters so that quotes or other special
    characters in scraped text cannot break or alter the statement. A database
    error is reported on stdout and otherwise ignored.
    """
    sql = (
        "insert into lcr_cs (`img_url`,`img_height`,`img_width`,`img_save`,"
        "`title`,`img_thumb`,`img_cate`) values (%s,%s,%s,%s,%s,%s,%s)"
    )
    params = (img_url, img_height, img_width, img_save, title, img_thumb, cate)
    print(sql, params)
    with mutex:
        try:
            cursor.execute(sql, params)
            connect.commit()
        except pymysql.MySQLError:
            print(img_url + ":插入失败,记录可能已存在")


def save(img_url):
    """Download ``img_url`` into ``ORIG_DIRECTORY`` and create its thumbnail.

    The local file name is the last ten characters of the URL.

    Raises:
        requests.RequestException: If the download fails or the server
            answers with an error status.
    """
    res = requests.get(img_url, timeout=REQUEST_TIMEOUT)
    # Do not write an error page to disk as if it were an image.
    res.raise_for_status()
    os.makedirs(ORIG_DIRECTORY, exist_ok=True)
    local_path = os.path.join(ORIG_DIRECTORY, img_url[-10:])
    with open(local_path, 'wb') as f:
        f.write(res.content)
    create_thumbnail(local_path)


def create_thumbnail(filename):
    """Write a thumbnail of ``filename`` into ``THUMB_DIRECTORY``.

    The thumbnail fits within ``SIZE`` and is named ``thumb_<original name>``.
    """
    os.makedirs(THUMB_DIRECTORY, exist_ok=True)
    save_path = os.path.join(THUMB_DIRECTORY,
                             'thumb_' + os.path.basename(filename))
    with Image.open(filename) as im:
        # Image.LANCZOS is the filter formerly exposed as Image.ANTIALIAS.
        im.thumbnail(SIZE, Image.LANCZOS)
        im.save(save_path)


if __name__ == '__main__':
    # Worker identifiers.
    photo_thread = [1, 2]
    # Starting value of the processed-image counter.
    photo_number = -1
    # NOTE(review): no gevent monkey-patching is visible in this file, so the
    # blocking requests calls presumably do not yield between greenlets --
    # confirm whether the two workers are meant to overlap.
    thread1 = Greenlet.spawn(photo_download, photo_thread[0], 1, photo_number, 0)
    thread2 = gevent.spawn(photo_download, photo_thread[1], 300, photo_number, 0)
    threads = [thread1, thread2]
    # Block until both workers have finished.
    gevent.joinall(threads)