
2.1 F12打开浏览器开发者模式,使用抓包工具分析网页。
2.2 浏览分析网页,发现先向下滚动时,会向服务器发送请求获取新图片。
2.3 对比两个请求,请求中共有三个参数:cid、start、count。猜测cid为图片分类、start为获取图片的页码、count为获取图片的数量。发送请求验证猜想。
猜想基本正确,请求返回数据中共有30条数据,每条数据含有图片资源地址,图片资源地址保存在返回 JSON 的 data 字段当中。
2.4 分析完成,编写爬虫批量下载图片。
import os
import time
from multiprocessing.dummy import Pool
import requests
count = 0
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36',
}
image_size_url = [
'http://image.baidu.com/search/down?tn=download&word=download&ie=utf8&fr=detail&url=http://browser9.qhimg.com/bdm/2560_1600_100/',
'http://image.baidu.com/search/down?tn=download&word=download&ie=utf8&fr=detail&url=http://browser9.qhimg.com/bdm/1440_900_100/',
'http://image.baidu.com/search/down?tn=download&word=download&ie=utf8&fr=detail&url=http://browser9.qhimg.com/bdm/1024_768_100/',
'http://image.baidu.com/search/down?tn=download&word=download&ie=utf8&fr=detail&url=http://browser9.qhimg.com/bdm/800_600_100/',
'http://image.baidu.com/search/down?tn=download&word=download&ie=utf8&fr=detail&url=http://browser9.qhimg.com/bdm/0_0_100/'
]
base_fileName = '兔二工具'
# 使用cid、start、count拼接获取json数据的URL
def getUrl(cid, start, count):
return 'https://www.toer2.com/inHtml/PCwallpaper/api.php?cid=' + str(cid) + '&start=' + str(
start) + '&count=' + str(count)
# 使用URL获取json数据
def getJsonData(url):
return requests.get(url, headers=headers).json()
# 下载图片
def getImage(url, path):
image = requests.get(url, headers=headers)
with open(path, 'wb') as f:
f.write(image.content)
f.flush()
f.close()
# 处理返回的json数据
def delJsonData(jsonData):
real_data = []
data = jsonData['data']
for one in data:
image_message = {
'image_id': one['url'].split('/')[-1], # 图片id
'dic': one['tag'] + one['id'], # 图片目录名
'image_name': ['2560_1600_100', '1440_900_100', '1024_768_100', '800_600_100', '0_0_100']
}
real_data.append(image_message)
return real_data
# 批量下载图片
def downloadImage(data):
global count
count = 1
for temp in data:
if not os.path.exists(base_fileName + temp['dic']):
os.makedirs(base_fileName + temp['dic'])
print('正在下载第 ' + str(count) + ' 组图片:')
count += 1
num = 1
for base_url in image_size_url:
print(' 下载第 ' + str(count - 1) + ' 组' + '第 ' + str(num) + ' 张图片......')
num += 1
download_url = base_url + temp['image_id']
ima_name = base_fileName + temp['dic'] + '/' + base_url.split('/')[-2] + '.jpg'
getImage(download_url, ima_name)
def downloadImageThread(one):
global count
if not os.path.exists(base_fileName + one['dic']):
os.makedirs(base_fileName + one['dic'])
num = 1
for base_url in image_size_url:
# global count
print('下载第 ' + str(count) + ' 组' + '第 ' + str(num) + ' 张图片......n')
num += 1
download_url = base_url + one['image_id']
ima_name = base_fileName + one['dic'] + '/' + base_url.split('/')[-2] + '.jpg'
getImage(download_url, ima_name)
count += 1
start_time = time.time()
# 使用线程池爬取
pool = Pool(5)
# 多线程爬取
# pool.map(downloadImageThread,delJsonData(getJsonData(getUrl(6,0,60))))
# 单线程爬取
# downloadImage(delJsonData(getJsonData(getUrl(6,0,30))))
# 单线程批量爬取
for i in range(0, 500, 30):
downloadImage(delJsonData(getJsonData(getUrl(6, i, 30))))
# 多线程批量爬取
# for i in range(0,300,60):
# pool.map(downloadImageThread, delJsonData(getJsonData(getUrl(6, i, 60))))
end_time = time.time()
print('爬取成功!', end_time - start_time)
4. 效果展示
4.1 爬取页面
4.2 爬取的图片