博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
今日头条街拍
阅读量:4361 次
发布时间:2019-06-07

本文共 3279 字,大约阅读时间需要 10 分钟。

spider.py

"""Crawl Toutiao search results for a keyword and archive gallery images.

For each result page the script extracts article URLs, pulls the image
gallery embedded in every article page, downloads the images into the
current working directory and stores the title/url/image list in MongoDB.
"""
import json
import os
import re
from hashlib import md5
from json.decoder import JSONDecodeError
from multiprocessing import Pool
from urllib.parse import urlencode

import pymongo
import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError, RequestException

# Provides MONGO_URL, MONGO_DB, MONGO_TABLE, GROUP_START, GROUP_END, KEYWORD.
from config import *

# connect=False makes the client connect on first operation instead of at
# construction time (presumably so each Pool worker connects after forking).
client = pymongo.MongoClient(MONGO_URL, connect=False)
db = client[MONGO_DB]

# Seconds to wait for a server before giving up; without a timeout a stalled
# connection would block a pool worker forever.
_REQUEST_TIMEOUT = 10

# Captures the string argument of `gallery: JSON.parse("...")` in a detail page.
_IMAGES_PATTERN = re.compile(r'gallery: JSON.parse\("(.*)"\)', re.S)


def _fetch(url):
    """Return the response for a GET of *url* on HTTP 200, otherwise None."""
    try:
        response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    except ConnectionError:
        print('Error occurred')
        return None
    except RequestException as exc:
        # Timeouts, malformed URLs, etc.: skip this URL, keep crawling.
        print('Request failed', url, exc)
        return None
    if response.status_code == 200:
        return response
    return None


def get_page_index(offset, keyword):
    """Fetch one page of search results.

    Args:
        offset: Value sent as the `offset` query parameter.
        keyword: Search keyword.

    Returns:
        The response body as text, or None if the request failed or the
        status code was not 200.
    """
    data = {
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,
        'format': 'json',
        'keyword': keyword,
        'offset': offset,
    }
    params = urlencode(data)
    base = 'http://www.toutiao.com/search_content/'
    url = base + '?' + params
    response = _fetch(url)
    return response.text if response is not None else None


def download_image(url):
    """Download the image at *url* and save it to disk; always returns None."""
    print('Downloading', url)
    response = _fetch(url)
    if response is not None:
        save_image(response.content)
    return None


def save_image(content):
    """Write *content* (bytes) to `<cwd>/<md5 of content>.jpg` unless it exists.

    Naming the file after the content hash means identical images are
    stored only once.
    """
    file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
    print(file_path)
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)


def parse_page_index(text):
    """Yield the article URLs found in a search-result JSON document.

    Args:
        text: JSON text as returned by get_page_index(), or None.

    Yields:
        Each non-empty `article_url` of the entries under the `data` key.
        Nothing is yielded for missing, empty or malformed input.
    """
    if not text:
        # The index request failed; there is nothing to parse.
        return
    try:
        data = json.loads(text)
    except JSONDecodeError:
        return
    if not isinstance(data, dict):
        return
    for item in data.get('data') or []:
        article_url = item.get('article_url') if isinstance(item, dict) else None
        if article_url:
            # Entries without a URL cannot be fetched, so skip them.
            yield article_url


def get_page_detail(url):
    """Fetch an article page and return its HTML, or None on failure."""
    response = _fetch(url)
    return response.text if response is not None else None


def parse_page_detail(html, url):
    """Extract the title and gallery image URLs from an article page.

    Every image found is downloaded as a side effect.

    Args:
        html: Page source, or None if the page could not be fetched.
        url: The article URL, copied into the result.

    Returns:
        A dict with keys `title`, `url` and `images`, or None when the
        page has no parsable gallery.
    """
    if not html:
        return None
    soup = BeautifulSoup(html, 'lxml')
    title_tags = soup.select('title')
    title = title_tags[0].get_text() if title_tags else ''

    match = _IMAGES_PATTERN.search(html)
    if not match:
        return None
    try:
        # The gallery JSON is embedded in a JS string literal; dropping the
        # backslashes undoes its escaping before decoding.
        data = json.loads(match.group(1).replace('\\', ''))
    except JSONDecodeError:
        return None
    if not isinstance(data, dict) or 'sub_images' not in data:
        return None

    sub_images = data.get('sub_images') or []
    images = [item.get('url') for item in sub_images if item.get('url')]
    for image in images:
        download_image(image)
    return {
        'title': title,
        'url': url,
        'images': images,
    }


def save_to_mongo(result):
    """Insert *result* into the MONGO_TABLE collection.

    Returns:
        True if the insert was acknowledged, False otherwise.
    """
    # insert_one replaces Collection.insert, which was removed in PyMongo 4.
    if db[MONGO_TABLE].insert_one(result).acknowledged:
        print('Successfully Saved to Mongo', result)
        return True
    return False


def main(offset):
    """Crawl the result page at *offset* and store every gallery found."""
    text = get_page_index(offset, KEYWORD)
    for url in parse_page_index(text):
        html = get_page_detail(url)
        result = parse_page_detail(html, url)
        if result:
            save_to_mongo(result)


if __name__ == '__main__':
    pool = Pool()
    # One task per result page; offsets advance in steps of 20.
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    pool.map(main, groups)
    pool.close()
    pool.join()

config.py

# MongoDB connection settings.
MONGO_URL = 'localhost'
MONGO_DB = 'toutiao'
MONGO_TABLE = 'toutiao'

# Inclusive range of result-page indices to crawl.
GROUP_START = 1
GROUP_END = 20

# Search keyword.
KEYWORD = '街拍'

来源于微信公众号: 进击的Coder (ID:FightingCoder)

转载于:https://www.cnblogs.com/hankleo/p/11489793.html

你可能感兴趣的文章
Django模板系统
查看>>
位(Bit)与字节(Byte)
查看>>
关于两次指针(struct型)传参数的问题
查看>>
在Logstash的配置文件中对日志事件进行区分
查看>>
字符串之strcmp
查看>>
Android使用Fragment程序崩溃
查看>>
codevs 2822 爱在心中(强连通分量)
查看>>
七:python 对象类型详解三:列表
查看>>
c语言基本数据类型相关
查看>>
SQL Server DATEADD() 函数
查看>>
makefile中的wildcard和patsubst
查看>>
F#基础教程 mutable关键字
查看>>
完全卸载MySQL数据库
查看>>
C#总结项目《影院售票系统》编写总结一
查看>>
Failed to stop iptables.service: Unit iptables.service not loaded.
查看>>
madplay 移植到 TQ2440 遇到问题 madplay not found (2)
查看>>
LISTVIEW显示JPEG缩略图
查看>>
YII中引用自定义类
查看>>
Unity 基础
查看>>
Python字符串切片操作知识详解
查看>>