
继上次爬取了博主的文章列表之后,这次继续做文章的下载与保存。用搜索引擎查询后,果然没有 Python 办不到的事。借鉴代码之后,又增加了把文章中的图片下载到本地的功能。为什么不直接复制粘贴呢?那还是爬取比复制粘贴快呀~
文章地址:爬取CSDN博主文章列表,练习
源码如下:
import parsel, tomd, re
# pip install tomd -i https://pypi.tuna.tsinghua.edu.cn/simple
from module_path import util, logging
# 把文章保存为markdown
def download_article(url):
    """Download a CSDN article, convert it to markdown, localize its
    images, and save it as ``<project root>/files/<title>.md``.

    :param url: the article page url
    """
    try:
        html = util.get_html(url)
        # Parse the page with css selectors.
        sel = parsel.Selector(html)
        # Title and article body.
        title = sel.css('.title-article::text').get()
        content = sel.css('article').get()
        # Convert the html body to markdown.
        text = tomd.Tomd(content).markdown
        # NOTE(review): the patterns of the next two substitutions were
        # lost when this post was republished (they render as ''); as
        # written they are no-ops. TODO: restore the original cleanup
        # patterns that stripped the junk tomd leaves in headings.
        text = re.sub('', "", text)
        text = re.sub('', "", text)
        # Download every image and point the article at the local copy.
        # NOTE(review): the findall pattern was also lost; an <img> tag
        # matcher is the most plausible reconstruction — confirm against
        # the original post.
        imgs = re.findall(r'<img[^>]*>', text)
        root_path = util.JarProjectPath.project_root_path()
        # One folder per article for its images.
        folder = util.mkdir(path='%s/files/%s' % (root_path, title))
        for i, img_tag in enumerate(imgs):
            # Pull the remote src out of the tag. Do NOT reuse the name
            # `url` here — the original shadowed the function argument,
            # corrupting the failure message in the except block.
            match = re.search(r'src="([^"]+)"', img_tag)
            if match is None:
                # Tag without a usable src — leave it untouched.
                continue
            img_url = match.group(1)
            # Download the image; get the local path back.
            filepath = util.download_img(img_url, filename='%s_%d' % (title, i), filedir=folder)
            # Swap the html tag for a local markdown image reference.
            text = text.replace(img_tag, '![](%s)' % filepath)
        # Destination of the markdown file.
        filename = '%s/files/%s.md' % (root_path, title)
        # Write the title as a proper level-1 heading, then the body.
        with open(filename, mode='w', encoding='utf-8') as f:
            f.write('# ' + title + '\n')
            f.write(text)
    except Exception as e:
        logging.debug(e)
        print('失败>%s' % url)
if __name__ == '__main__':
    # Blog post url; any query parameters after '?' can be dropped.
    url = 'https://blog.csdn.net/qq_39454665/article/details/120507437'
    download_article(url)
其中,使用tomd 获取后,标题中会多出...的内容,再用正则替换掉,或者打开文件手动删除
# 提取文章的内容与格式
text = tomd.Tomd(content).markdown
text = re.sub('', "", text)
# 获取后,标题中带有..的去掉
text = re.sub('', "", text)
源代码中有
不处理,保存后,影响MD格式之美
# 获取请求头
def get_headers(localhost=True, refer="https://www.baidu.com", host=None):
    """Assemble the HTTP request headers used by the scraper.

    :param localhost: if True, always use the fixed Chrome user agent;
        if False, pick one at random from a pool of crawler/browser UAs
    :param refer: value for the Referer header
    :param host: value for the Host header (may be None)
    :return: dict with User-Agent, Referer and Host keys
    """
    agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36"
    if not localhost:
        # Pool of well-known spider / browser user agents.
        pool = (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36",
            "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
            "Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)",
            "Baiduspider-image+(+http://www.baidu.com/search/spider.htm)",
            "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
            "Mozilla/5.0 (compatible; Googlebot-Image/1.0; +http://www.google.com/bot.html)",
            "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
            "Sogou News Spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);",
            "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
            "Sosospider+(+http://help.soso.com/webspider.htm)",
            "Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html)",
        )
        agent = random.choice(pool)
    return {
        "User-Agent": agent,
        "Referer": refer,
        "Host": host,
    }
# 获取html
def get_html(url, ret_type="text", timeout=50, encoding="utf-8"):
    """Fetch *url* and return its body in the requested form.

    :param url: the page to request
    :param ret_type: "text" -> decoded text, "image" -> raw bytes,
        "json" -> parsed json; any other value yields None
    :param timeout: request timeout in seconds
    :param encoding: text encoding applied to the response
    """
    response = requests.get(url, headers=get_headers(), timeout=timeout)
    response.encoding = encoding
    if ret_type == "json":
        return response.json()
    if ret_type == "image":
        return response.content
    if ret_type == "text":
        return response.text
    return None
# 创建文件夹
def mkdir(path=None):
    """Create *path* (and any missing parents) and return it with a
    trailing '/'.

    :param path: directory to create. Defaults to a millisecond-timestamp
        folder under ``../images/`` relative to this file. The default is
        now computed per call — the original evaluated it once at module
        import, so every default call shared a single frozen-timestamp
        folder (classic mutable/evaluated-once default-argument bug).
    :return: the path with '/' appended
    """
    if path is None:
        # Mirrors the module's __dir__ = dirname(abspath(__file__)).
        base = os.path.dirname(os.path.abspath(__file__))
        path = os.path.join(base, '../images/%s' % str(round(time.time() * 1000)) + '/')
    # Only create (and announce) the folder when it does not exist yet.
    if not os.path.exists(path):
        os.makedirs(path)
        print("--- new folder... ---", path)
    return '%s/' % path
def download_img(src, filename=None, filedir=None, domain=None):
    """Download an image and return its local file path.

    :param src: image http url (may be relative when *domain* is given)
    :param filename: saved file name without extension; defaults to the
        current millisecond timestamp. Computed per call — the original
        default was evaluated once at import, so every default call
        reused the same frozen timestamp and overwrote one file.
    :param filedir: directory to save into; defaults to ``../images/``
        relative to this file
    :param domain: optional url prefix prepended to *src*
    :return: the local file path (the file is only written on HTTP 200)
    """
    if filename is None:
        filename = str(round(time.time() * 1000))
    if filedir is None:
        # Mirrors the module's __dir__ = dirname(abspath(__file__)).
        filedir = os.path.join(os.path.dirname(os.path.abspath(__file__)), '../images/')
    if domain is not None:
        src = domain + src
    pic = requests.get(src, timeout=20)
    # Strip query parameters before deriving the extension.
    src = src.split('?')[0]
    # splitext handles .jpeg and other long extensions correctly; fall
    # back to the old last-4-chars heuristic when the url has none.
    ext = os.path.splitext(src)[1] or src[-4:]
    # os.path.join works whether or not filedir has a trailing slash
    # (the old string concatenation silently required one).
    filepath = os.path.join(filedir, filename + ext)
    if pic.status_code == 200:
        with open(filepath, 'wb') as f:
            f.write(pic.content)
    return filepath
# 获取当前项目根路径
class JarProjectPath:
    """Helper for locating the project root directory on disk."""

    @staticmethod
    def project_root_path(project_name=None):
        """Return the absolute path of the project root.

        The root is found by locating ``<project_name><separator>``
        inside this file's absolute path and cutting the path off right
        after it.

        :param project_name: project folder name; defaults to 'py' —
            change it to your own project name
        :return: the root path string
        """
        name = 'py' if project_name is None else project_name
        # The original built the marker with "{}\".format(...) — the
        # backslash escaped the closing quote, a syntax error. Using
        # os.sep also makes this portable beyond Windows.
        marker = name + os.sep
        project_path = os.path.abspath(os.path.dirname(__file__))
        return project_path[:project_path.find(marker) + len(marker)]
其他说明!!! 获取项目根路径,记得把方法里的默认py换成你的项目名称~~PROJECT_NAME
该py文件在 我的某一个文件夹sqc下,然后工具类又在另一个文件夹common下,这会使工具类不能直接在改文件中import util,
找不到这个util模块
PS C:\Users\dyjx\Desktop\py> & D:/Python/Python39/python.exe c:/Users/dyjx/Desktop/py/sqc/download_csdn_article.py
Traceback (most recent call last):
  File "c:\Users\dyjx\Desktop\py\sqc\download_csdn_article.py", line 6, in &lt;module&gt;
    import util
ModuleNotFoundError: No module named 'util'
你需要把工具类所在的文件夹加入模块搜索路径,这样之后 import util 才能找到这个模块。
import sys, os
__dir__ = os.path.dirname(os.path.abspath(__file__))
sys.path.append(__dir__)
sys.path.append(os.path.abspath(os.path.join(__dir__, './common')))
import util
import logging
logging.basicConfig(filename="out.txt", level=logging.DEBUG, format="%(asctime)s %(levelname)s -- %(message)s")
所以,我又把上面这几行代码整合到与两个文件夹同级的一个文件 module_path.py 里
再直接引用module_path.py,减少了每次复制上面的这几行代码,嘻嘻
from module_path import util, logging
保存后,好像 有一些小地方不对,再优化。比如 序号格式?第一行的标题