【scrapy】 1.使用scrapy框架高效爬取汽车之家图片

使用scrapy框架完成图片的爬取

一、项目基础配置

创建项目

1
2
3
scrapy startproject BmwImages
cd BmwImages
scrapy genspider bmwSpider car.autohome.com.cn

修改基础配置

1
2
3
* settings 中设置 ROBOTSTXT_OBEY = False(不遵循 robots 协议)
* 设置 DEFAULT_REQUEST_HEADERS(默认请求头)
* 设置 USER_AGENT(浏览器标识)

增加start.py启动爬虫文件

1
2
# Launch the spider from inside the IDE instead of typing the
# "scrapy crawl" command in a terminal.
from scrapy import cmdline

cmdline.execute(["scrapy", "crawl", "bmwSpider"])

二、爬取图片数据

获取图片类别名称和urls列表

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
import scrapy
from urllib import parse
from BmwImages.items import BmwimagesItem

class BmwspiderSpider(scrapy.Spider):
    """Spider for one Autohome picture-series page: yields one item per
    image category with that category's full-size image URLs."""

    name = 'bmwSpider'
    allowed_domains = ['car.autohome.com.cn']
    start_urls = ['https://car.autohome.com.cn/pic/series/66.html']

    def parse(self, response):
        # The first uibox on the page is skipped ([1:]); the remaining
        # boxes each hold one image category.
        for box in response.xpath("//div[@class='uibox']")[1:]:
            category = box.xpath(".//div[@class='uibox-title']/a/text()").get()
            thumb_srcs = box.xpath(".//div//ul/li/a/img/@src").getall()
            # Dropping the "240x180_..." prefix turns the thumbnail URL
            # into the larger image URL; urljoin makes it absolute.
            image_urls = [
                parse.urljoin(response.url, src.replace("240x180_0_q95_c42_", ""))
                for src in thumb_srcs
            ]
            yield BmwimagesItem(category=category, image_urls=image_urls)

定义 Item 字段(items.py),供图片管道下载使用

1
2
3
4
5
6
import scrapy

class BmwimagesItem(scrapy.Item):
    """Item passed to the images pipeline.

    `image_urls` and `images` are the field names the Scrapy
    ImagesPipeline reads from and writes to by default.
    """

    category = scrapy.Field()    # category name, used as a sub-directory
    image_urls = scrapy.Field()  # URLs to download
    images = scrapy.Field()      # download results filled in by the pipeline

settings.py代码

1
2
3
4
5
6
7
8
import os

# Hand items to Scrapy's built-in image pipeline for downloading.
ITEM_PIPELINES = {
    # 'BmwImages.pipelines.BmwimagesPipeline': 300,
    'scrapy.pipelines.images.ImagesPipeline': 1,
}

# Images are stored in an "images" directory two levels above this file
# (i.e. next to the project package).
_project_dir = os.path.dirname(os.path.dirname(__file__))
IMAGES_STORE = os.path.join(_project_dir, 'images')

重写方法设置目录

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
import os
from scrapy.pipelines.images import ImagesPipeline
from BmwImages import settings

class BmwImagesPipeline(ImagesPipeline):
    """ImagesPipeline subclass that stores each downloaded image under
    a per-category sub-directory of IMAGES_STORE."""

    # Called before the download requests are sent: attach the item to
    # each request so file_path() can read its category later.
    def get_media_requests(self, item, info):
        request_objs = super(BmwImagesPipeline, self).get_media_requests(item, info)
        for request_obj in request_objs:
            request_obj.item = item
        return request_objs

    # Called when an image is about to be stored: return the path
    # <IMAGES_STORE>/<category>/<file name> instead of the default
    # "full/<hash>.jpg" layout.
    def file_path(self, request, response=None, info=None):
        path = super(BmwImagesPipeline, self).file_path(request, response, info)
        category = request.item.get('category')
        image_store = settings.IMAGES_STORE
        category_path = os.path.join(image_store, category)
        # makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it
        # also creates missing parent directories and avoids the race
        # where the directory appears between the check and the mkdir.
        os.makedirs(category_path, exist_ok=True)

        image_name = path.replace("full/", "")
        image_path = os.path.join(category_path, image_name)
        return image_path

使用crawlSpider进行改进,爬取更多图片

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
# -*- coding: utf-8 -*-
import scrapy
from urllib import parse
from BmwImages.items import BmwimagesItem
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class BmwspiderSpider(CrawlSpider):
    """CrawlSpider that follows the picture pages of series 66 and
    yields one item per page with its category and image URLs."""

    name = 'bmwCrawlSpider'
    allowed_domains = ['car.autohome.com.cn']
    start_urls = ['https://car.autohome.com.cn/pic/series/66.html']

    # Scrapy documents `rules` as an ordered sequence, so use a tuple
    # rather than a set.  The literal dots in the URL are escaped so "."
    # no longer matches arbitrary characters.
    # NOTE(review): "66.+" still matches e.g. .../series/660.html as well
    # as .../series/66-1.html — confirm the intended page pattern.
    rules = (
        Rule(
            LinkExtractor(allow=r"https://car\.autohome\.com\.cn/pic/series/66.+"),
            callback="parse_images",
            follow=True,
        ),
    )

    def parse_images(self, response):
        """Extract the page's category title and full-size image URLs."""
        category = response.xpath("//div[@class='uibox-title']/text()").getall()
        # join + split strips the surrounding whitespace fragments that
        # come back with the text nodes.
        category = "".join(category).split()
        srcs = response.xpath("//div[contains(@class,'uibox-con')]/ul/li//img/@src").getall()
        # Swap the thumbnail size token for the 1024px one and make the
        # URLs absolute against the page URL.
        srcs = [
            parse.urljoin(response.url, src.replace("240x180_0_q95_c42_", "1024x0_1_q95_"))
            for src in srcs
        ]
        if category:
            yield BmwimagesItem(category=category[0], image_urls=srcs)