70行代码爬取 查字典_笑话(多线程)

软件发布|下载排行|最新软件

当前位置:首页 > IT学院 > IT技术

70行代码爬取 查字典_笑话(多线程)

小毅i   2020-02-19 我要评论
 1 from queue import Queue, Empty
 2 import threading
 3 import requests
 4 from pyquery import PyQuery
 5 import time
 6 
 7 index = 'https://www.chazidian.com'  # site root; get_url/get_content prepend it to scraped href and img paths
 8 list_page = index + '/xiaohua{}/{}'  # listing URL template with two str.format slots (get_url fills them with '' and the page number)
 9 
10 headers = {  # HTTP headers passed to every requests.get call in this listing
11     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'
12 }
13 timeout = 1  # seconds get_content waits in queue.get before giving up; also subtracted from the reported run time
14 thread_quantity = 5  # number of get_content threads that main() creates
15 
16 
def get_url(queue, first_page=1, last_page=75, request_timeout=10):
    """Put the URL of every joke linked from the listing pages onto ``queue``.

    Each listing page numbered ``first_page`` through ``last_page`` is
    downloaded, every ``div.arctcot h3 a`` anchor on it is read, and
    ``index + href`` is queued.

    Args:
        queue: Queue that receives one URL string per joke link.
        first_page: Number of the first listing page to fetch.
        last_page: Number of the last listing page to fetch (inclusive).
            When it is smaller than ``first_page`` nothing is fetched.
        request_timeout: Seconds ``requests`` may wait on the server before
            a listing page is given up on.
    """
    for page_no in range(first_page, last_page + 1):
        page_url = list_page.format('', str(page_no))
        try:
            response = requests.get(
                page_url, headers=headers, timeout=request_timeout
            )
            response.raise_for_status()
        except requests.RequestException as exc:
            # A single unreachable listing page must not end the whole
            # producer thread; report it and move on to the next page.
            print('skip {}: {}'.format(page_url, exc))
            continue

        for anchor in PyQuery(response.text)('div.arctcot h3 a').items():
            href = anchor.attr('href')
            # attr() yields None for an anchor without href, and
            # ``index + None`` would raise TypeError.
            if href:
                queue.put(index + href)
26 
27 
def get_content(queue, idle_timeout=None, request_timeout=10):
    """Take joke URLs from ``queue``, download each page and print the joke.

    The worker keeps pulling URLs until the queue has stayed empty for the
    idle timeout, then prints a separator line and a completion message.

    For every page the ``div.arctcot`` element is inspected:

    * title plus an image with a site-relative ``src``: prints the URL and
      ``{title: index + src}``;
    * title plus text (no image): prints the URL and ``{title: text}``,
      unless the text is already contained in the title.

    Args:
        queue: Queue of URL strings to process.
        idle_timeout: Seconds to wait for a new URL before stopping.
            ``None`` (the default) uses the module-level ``timeout``.
        request_timeout: Seconds ``requests`` may wait on the server before
            a page is given up on.
    """
    wait = timeout if idle_timeout is None else idle_timeout

    while True:
        try:
            url = queue.get(timeout=wait)
        except Empty:
            break

        if 'https://' not in url:
            continue

        try:
            response = requests.get(
                url, headers=headers, timeout=request_timeout
            )
            response.raise_for_status()
        except requests.RequestException as exc:
            # Without this, one failed download would escape the loop and
            # silently terminate the worker thread.
            print('skip {}: {}'.format(url, exc))
            continue

        article = PyQuery(response.text)('div.arctcot')
        title = article('a').text()
        content = article('div.article_detail').text()
        img = article('div.article_detail img').attr('src')

        if title and img:
            # Only site-relative image paths are reported. Absolute URLs are
            # skipped for both schemes, so an https:// src is no longer
            # glued onto ``index`` to form a broken address.
            if not img.startswith(('http://', 'https://')):
                print(url)
                print({title: index + img})
        elif title and content:
            if content not in title:
                print(url)
                print({title: content})

    print('-' * 100)
    print('抓取完毕')
52 
53 
def main():
    """Run one listing producer and ``thread_quantity`` content workers.

    A ``get_url`` thread fills a bounded queue with joke URLs while
    ``thread_quantity`` ``get_content`` threads drain it. The function
    returns once every worker and the producer have finished.
    """
    queue_ = Queue(maxsize=1000)

    producer = threading.Thread(target=get_url, args=(queue_,))
    producer.start()

    workers = [
        threading.Thread(target=get_content, args=(queue_,))
        for _ in range(thread_quantity)
    ]
    # Start every worker before joining any of them: joining right after
    # each start() makes the workers run one after another instead of
    # concurrently.
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

    producer.join()
64 
65 
if __name__ == '__main__':
    # perf_counter() is a monotonic, high-resolution clock meant for
    # measuring intervals; time.time() can jump when the system clock is
    # adjusted.
    start = time.perf_counter()
    main()
    end = time.perf_counter()
    # ``timeout`` is subtracted from the total, presumably to discount the
    # idle wait the workers spend on the empty queue before they stop.
    print('用时: ', end - start - timeout)

 

Copyright 2022 版权所有 软件发布 访问手机版

声明:所有软件和文章来自软件开发商或者作者 如有异议 请与本站联系 联系我们