爬取头条美女图片

软件发布|下载排行|最新软件

当前位置:首页 > IT学院 > IT技术

爬取头条美女图片

柯基小短腿   2019-11-14 我要评论
import os,time
import requests
from urllib.parse import urlencode
from urllib.request import urlretrieve
import random

def getPage(offset):
    """Fetch one page of Toutiao search results and return the parsed JSON.

    Args:
        offset: Result offset to request; sent as the ``offset`` query
            parameter so successive calls retrieve successive pages.

    Returns:
        The decoded JSON body when the server answers with HTTP 200.
        ``None`` when the status is anything else or the connection fails.
    """
    # Request headers; the cookie value is a placeholder to be filled in.
    headers = {
        'accept-language': 'zh-CN,zh;q=0.9',
        'content-type': 'application/x-www-form-urlencoded',
        'cookie': '设置cookie',
        'referer': 'https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D',
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
        'x-requested-with': 'XMLHttpRequest',
    }
    # Query parameters. The offset comes from the caller; it used to be
    # hard-coded to '20', which made every call fetch the same page.
    params = {
        'aid': '24',
        'app_name': 'web_search',
        'offset': offset,
        'format': 'json',
        'keyword': '街拍',
        'autoload': 'true',
        'count': '20',
        'en_qc': '1',
        'cur_tab': '1',
        'from': 'search_tab',
        'pd': 'synthesis',
        # NOTE(review): fixed timestamp kept from the original; confirm
        # whether the API needs a current value.
        'timestamp': '1573737426901',
    }
    url = 'https://www.toutiao.com/api/search/content/?' + urlencode(params)
    try:
        response = requests.get(url, headers=headers)
    except requests.ConnectionError:
        return None
    print(response.status_code)
    if response.status_code == 200:
        return response.json()
    # Any other status: report "no data" explicitly.
    return None

def getImages(json):
    """Yield one record per image found in a search-result payload.

    Args:
        json: Parsed response mapping, or ``None`` when the fetch failed.
            Its ``'data'`` entry is read as a list of result items, each of
            which may carry a ``'title'`` and an ``'image_list'``.

    Yields:
        ``{'image': <url>, 'title': <title>}`` for every entry of every
        item's ``'image_list'``.
    """
    # A failed fetch hands us None (or an empty mapping): nothing to yield.
    if not json:
        return
    for item in json.get('data') or []:
        image_list = item.get('image_list')
        if not image_list:
            # Skip results without images rather than ending the whole
            # generator, so later items are still processed.
            continue
        title = item.get('title')
        for image in image_list:
            yield {
                'image': image.get('url'),
                'title': title,
            }

def saveImage(item):
    """Download one image into a per-title directory under ``./mypic/``.

    Args:
        item: Mapping with an ``'image'`` URL and a ``'title'``; the title
            names the directory the image is stored in.

    The file is written to ``./mypic/<title>/<last URL segment>.jpg``.
    Errors raised by the download itself propagate to the caller.
    """
    # Build the directory name from the title. Drop characters that cannot
    # appear in a directory name on common filesystems; fall back to a
    # fixed name when the title is missing or becomes empty.
    forbidden = str.maketrans("", "", '\\/:*?"<>|')
    title = (item.get('title') or "").translate(forbidden) or "untitled"
    path = os.path.join("./mypic/", title)

    # makedirs also creates the parent ./mypic/ and tolerates an existing dir.
    os.makedirs(path, exist_ok=True)

    # Derive the download URL and the local file name.
    local_image_url = item.get('image')
    # str.find returns the int -1 when the marker is absent.
    if local_image_url.find("190", 15) == -1:
        image_url = local_image_url.replace("190x124/", "")
    else:
        # Presumably "list/190x124" is the thumbnail path and "large" the
        # full-size variant -- TODO confirm against the image host.
        local_image_url = local_image_url.replace("list/190x124", "large")
        image_url = local_image_url
    print(image_url)
    save_pic = path + "/" + local_image_url.split("/").pop() + ".jpg"
    print(save_pic)

    # Store the image directly with urllib's urlretrieve.
    urlretrieve(image_url, save_pic)

def main(offset):
    """Crawl one result page: fetch it, then print and save every image.

    Args:
        offset: Result offset passed through to ``getPage``.
    """
    page = getPage(offset)
    if page is None:
        # getPage returns None when the request failed or the status was
        # not 200; there is nothing to parse in that case.
        return
    for item in getImages(page):
        print(item)
        saveImage(item)

# Run the crawl only when this file is executed as a script: request five
# consecutive result pages (offsets 0, 20, ..., 80), pausing one second
# after each page.
if __name__ == '__main__':
    for page_offset in range(0, 100, 20):
        main(offset=page_offset)
        time.sleep(1)



Copyright 2022 版权所有 软件发布 访问手机版

声明:所有软件和文章来自软件开发商或者作者 如有异议 请与本站联系 联系我们