爬取头条美女图片

柯基小短腿 2019-11-14 我要评论

import os,time
import requests
from urllib.parse import urlencode
from urllib.request import urlretrieve
import random

def getPage(offset):
    '''Fetch one page of Toutiao search results for the keyword "街拍".

    Args:
        offset: Index of the first search result to request. It is sent as
            the ``offset`` query parameter; each page holds ``count`` (20)
            results, so callers step it in multiples of 20.

    Returns:
        The decoded JSON body when the server answers with HTTP 200,
        otherwise None (non-200 status, connection failure or timeout).
    '''
    # Request headers sent with the search call; the cookie value is a
    # placeholder that must be filled in before running.
    headers = {
        'accept-language': 'zh-CN,zh;q=0.9',
        'content-type': 'application/x-www-form-urlencoded',
        'cookie': '设置cookie',
        'referer': 'https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D',
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
        'x-requested-with': 'XMLHttpRequest'
    }
    # Query-string parameters. 'offset' now follows the argument instead of
    # being pinned to '20', so successive calls really fetch different pages.
    params = {
        'aid': '24',
        'app_name': 'web_search',
        'offset': offset,
        'format': 'json',
        'keyword': '街拍',
        'autoload': 'true',
        'count': '20',
        'en_qc': '1',
        'cur_tab': '1',
        'from': 'search_tab',
        'pd': 'synthesis',
        # NOTE(review): fixed millisecond timestamp, presumably copied from a
        # captured browser request -- confirm whether the API needs a fresh one.
        'timestamp': '1573737426901'
    }
    url = 'https://www.toutiao.com/api/search/content/?' + urlencode(params)
    try:
        # A timeout keeps the crawler from hanging forever on a stalled socket.
        response = requests.get(url, headers=headers, timeout=10)
    except (requests.ConnectionError, requests.Timeout):
        return None
    print(response.status_code)
    if response.status_code == 200:
        return response.json()
    return None

def getImages(json):
    '''Yield one record per image found in a search-result payload.

    Args:
        json: Decoded search response (a dict with an optional ``data``
            list), or None when the page could not be fetched.

    Yields:
        Dicts of the form ``{'image': <url>, 'title': <title>}``, one for
        every entry in each item's ``image_list``. Items without an
        ``image_list`` are skipped.
    '''
    if not json:
        # Nothing was fetched; produce an empty stream instead of crashing.
        return
    for item in json.get('data') or []:
        image_list = item.get('image_list')
        if not image_list:
            # Skip this item only. Returning here would silently drop every
            # later item that does carry images.
            continue
        title = item.get('title')
        for image in image_list:
            yield {
                'image': image.get('url'),
                'title': title
            }

def saveImage(item):
    '''Download one image into a per-title folder under ``./mypic/``.

    Args:
        item: Dict with an ``'image'`` URL and a ``'title'`` string, as
            produced by ``getImages``.

    Returns:
        None. Records without an image URL are ignored.

    Side effects:
        Creates ``./mypic/<title>/`` (including ``./mypic/`` itself) when
        missing, prints the source URL and target path, and writes the
        downloaded file as ``<last URL segment>.jpg``.
    '''
    image_url = item.get('image')
    if not image_url:
        return

    # Strip characters that are illegal in directory names on common file
    # systems (the double quote was already removed before); a '/' in the
    # title would otherwise point at a nested, non-existent directory.
    raw_title = item.get('title') or ''
    title = raw_title.translate(str.maketrans('', '', '\\/:*?"<>|')).strip()
    path = os.path.join("./mypic/", title or "untitled")

    # makedirs also creates the ./mypic/ parent, which plain mkdir cannot do.
    os.makedirs(path, exist_ok=True)

    # Swap the thumbnail path segment for the full-size one. The previous
    # int-vs-str check (find(...) == '-1') was always False, so this
    # replacement was already the only branch ever taken; URLs without the
    # segment pass through unchanged.
    image_url = image_url.replace("list/190x124", "large")
    print(image_url)
    save_pic = path + "/" + image_url.split("/").pop() + ".jpg"
    print(save_pic)

    # urlretrieve streams the remote file straight to disk.
    urlretrieve(image_url, save_pic)

def main(offset):
    '''Crawl one result page and download every image it lists.

    Args:
        offset: Result offset passed through to ``getPage``.

    Returns:
        None. Returns early when the page could not be fetched.
    '''
    page = getPage(offset)
    if not page:
        # getPage returns None on a failed request; there is nothing to parse.
        return
    for item in getImages(page):
        print(item)
        saveImage(item)

# Script entry point: when run directly, crawl five consecutive result pages
# (offsets 0, 20, 40, 60, 80), pausing one second after each page.
if __name__ == '__main__':
    page_size = 20
    for page_offset in range(0, 5 * page_size, page_size):
        main(offset=page_offset)
        time.sleep(1)

python基础-面向过程编程

猜您喜欢

11-14 Python openpyxl Excel绘制柱形图
11-14 canvas绘制工作流之绘制节点
11-14 敲开通往架构师的门
11-14 EFK教程 - ElasticSearch高性能高可用架构
11-14 canvas线条实践之运动的正方形
11-14 高性能Web动画和渲染原理系列（4）“Compositor-Pipeline演讲PPT”学习摘要
11-14 Project Euler 59: XOR decryption
11-14 jQuery源码分析--为什么在参数列表中传入undefined
11-14 详解Kafka Producer
11-14 分享一份关于Hadoop2.2.0集群环境搭建文档
11-14 拉格朗日插值复习笔记
11-14 RocketMQ ACL使用指南
11-14 cnblogs侧边栏访客统计小插件
11-14 实现 call、apply、bind
11-14 ubuntu安装gcc不同的版本
11-14 kafka集群安装
11-14 我和我的公众号
11-14 Spring-Mybatis-SpringMVC三大框架整合

爬取头条美女图片

相关文章

猜您喜欢

今日热门