【scrapy】 2. Advanced Scrapy: Random Request Headers, Random Proxies, and Asynchronous Data Storage

Building on the image crawler written with the Scrapy framework.

1. Setting up a random request header middleware

Modify the middleware in middlewares.py:

import random

# Custom downloader middleware (rewrites the generated stub)
class BmwimagesDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # Scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    # random.choice() needs a sequence, so use a list rather than a set
    USER_AGENT = [
        'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; Acoo Browser 1.98.744; .NET CLR 3.5.30729)',
        'Chrome (AppleWebKit/537.1; Chrome50.0; Windows NT 6.3) AppleWebKit/537.36 (KHTML like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393',
        'Cyberdog/2.0 (Macintosh; 68k)',
    ]

    def process_request(self, request, spider):
        # Pick a random User-Agent for every outgoing request
        user_agent = random.choice(self.USER_AGENT)
        request.headers['User-Agent'] = user_agent
        # Returning None lets the request continue through the chain
        return None

Enable the downloader middleware in settings.py:

DOWNLOADER_MIDDLEWARES = {
    'BmwImages.middlewares.BmwimagesDownloaderMiddleware': 543,
}
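
To check that the random header is actually applied, one quick sketch (the ua_check spider below is hypothetical, not part of the project) is to request httpbin.org/user-agent, which echoes back the User-Agent it received:

import scrapy

class UACheckSpider(scrapy.Spider):
    # Hypothetical spider used only to verify the random User-Agent
    name = 'ua_check'

    def start_requests(self):
        # dont_filter=True so the duplicate filter allows the repeated URL
        for _ in range(3):
            yield scrapy.Request('http://httpbin.org/user-agent',
                                 dont_filter=True)

    def parse(self, response):
        # Each response should show a User-Agent drawn from the pool
        self.logger.info(response.text)

Each of the three responses should log one of the User-Agent strings from the middleware's list.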

2. Setting up a random proxy middleware

Modify the middleware in middlewares.py:

import random

class BmwimagesDownloaderMiddleware(object):

    def process_request(self, request, spider):
        # IP proxy middleware: a purchased proxy service supplies
        # a list of proxy IPs and ports to fill in here
        proxy_list = []  # e.g. ['http://ip1:port1', 'http://ip2:port2']
        proxy = random.choice(proxy_list)
        request.meta['proxy'] = proxy

Enable the downloader middleware in settings.py:

DOWNLOADER_MIDDLEWARES = {
    'BmwImages.middlewares.BmwimagesDownloaderMiddleware': 543,
}
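
If the purchased proxies require authentication, the proxy URL alone is not enough; a Proxy-Authorization header is needed as well. A minimal sketch, assuming placeholder credentials and w3lib's basic_auth_header helper (w3lib ships as a Scrapy dependency); the class name, proxy list, and user/pass below are all placeholders:

import random
from w3lib.http import basic_auth_header

class RandomAuthProxyMiddleware(object):
    # Hypothetical middleware; proxy list and credentials are placeholders
    PROXIES = ['http://ip1:port1', 'http://ip2:port2']

    def process_request(self, request, spider):
        request.meta['proxy'] = random.choice(self.PROXIES)
        # Builds the b'Basic ...' header value from the credentials
        request.headers['Proxy-Authorization'] = basic_auth_header('user', 'pass')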

Storing crawled data in the database (pipelines.py)

import pymysql

class JsPipeline(object):
    def __init__(self):
        dbparams = {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'root',
            'password': 'root',
            'database': 'jianshu',
            'charset': 'utf8'
        }
        self.conn = pymysql.connect(**dbparams)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # Parameterized query: the driver escapes the value, unlike
        # string formatting, which breaks on quotes and invites injection
        sql = "INSERT INTO `article` (`title`) VALUES (%s)"
        self.cursor.execute(sql, (item['title'],))
        self.conn.commit()
        return item
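
Like the downloader middlewares, a pipeline only runs once it is enabled in settings.py; the JianShu module path below is an assumption based on the Js* class names:

ITEM_PIPELINES = {
    # Module path assumed from the JsPipeline class name; adjust to the project
    'JianShu.pipelines.JsPipeline': 300,
}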

Switching to asynchronous storage. The pipeline above blocks the whole crawl on every INSERT; Twisted's adbapi connection pool runs the queries in a thread pool instead, so the spider keeps crawling while rows are written:

from pymysql import cursors
from twisted.enterprise import adbapi

class JsTwistedPipeline(object):

    def __init__(self):
        dbparams = {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'root',
            'password': 'root',
            'database': 'jianshu',
            'charset': 'utf8',
            'cursorclass': cursors.DictCursor
        }
        # adbapi wraps pymysql in a Twisted-managed connection pool
        self.dbpool = adbapi.ConnectionPool('pymysql', **dbparams)

    def process_item(self, item, spider):
        # runInteraction executes insert_item in a pool thread without
        # blocking the crawl; errors are routed to handle_error
        defer = self.dbpool.runInteraction(self.insert_item, item)
        defer.addErrback(self.handle_error, item, spider)
        return item

    def insert_item(self, cursor, item):
        sql = "INSERT INTO `article` (`title`) VALUES (%s)"
        cursor.execute(sql, (item['title'],))

    def handle_error(self, error, item, spider):
        print('=' * 10 + 'error' + '=' * 10)
        print(error)
        print('=' * 10 + 'error' + '=' * 10)
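
Both pipelines read item['title'], so the spider is assumed to yield items with a title field; a minimal items.py sketch (the ArticleItem name is an assumption):

import scrapy

class ArticleItem(scrapy.Item):
    # Only the title field is used by the pipelines above
    title = scrapy.Field()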

Crawling AJAX-loaded data with Selenium + WebDriver. Content rendered by JavaScript is missing from the raw HTML that Scrapy downloads, so a downloader middleware hands each request to a real browser, waits for the page to render, and returns the rendered HTML as the response:

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from scrapy.http import HtmlResponse
import time

class JsSeleniumWebDriverMiddleware(object):

    def __init__(self):
        driver_path = r"D:\python\scrapy\study\20191009\chromedriver\chromedriver2-0.exe"
        self.driver = webdriver.Chrome(executable_path=driver_path)

    def process_request(self, request, spider):
        self.driver.get(request.url)
        time.sleep(1)
        try:
            # Keep clicking the "更多" (show more) link until it disappears;
            # find_element_by_xpath raises NoSuchElementException when the
            # link is gone, which ends the loop. The XPath is site-specific,
            # so adjust it to match the target page.
            while True:
                showmore = self.driver.find_element_by_xpath(
                    "//a[contains(text(), '更多')]")
                showmore.click()
                time.sleep(0.3)
        except NoSuchElementException:
            pass

        # Return the fully rendered page; Scrapy skips its own download
        # and passes this response straight to the spider
        source = self.driver.page_source
        response = HtmlResponse(url=self.driver.current_url, body=source,
                                request=request, encoding='utf-8')
        return response

Enable the Selenium + WebDriver middleware in settings.py:

DOWNLOADER_MIDDLEWARES = {
    'BmwImages.middlewares.JsSeleniumWebDriverMiddleware': 543,
}
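
One loose end: the middleware opens Chrome in __init__ but never closes it. A sketch using Scrapy's standard from_crawler hook and the spider_closed signal to quit the driver when the crawl ends (the wiring below is an addition, not part of the original code):

from scrapy import signals

class JsSeleniumWebDriverMiddleware(object):
    # __init__ and process_request as above

    @classmethod
    def from_crawler(cls, crawler):
        middleware = cls()
        # Call spider_closed when the crawl finishes
        crawler.signals.connect(middleware.spider_closed,
                                signal=signals.spider_closed)
        return middleware

    def spider_closed(self, spider):
        # Shut down the browser so no Chrome process is left running
        self.driver.quit()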