前言
初学 Python 时写的爬虫小程序:爬取动漫之家《镖人》漫画的所有图片到本地。单线程爬取,速度一般;如果改用多线程会有很大的速度提升。抽取其中的爬虫逻辑即可爬取全站漫画资源。
代码
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98
| import os import re import requests from bs4 import BeautifulSoup from tqdm import tqdm
def get_url(url):
    """Fetch *url* and return its decoded HTML text, or None on a non-200 status.

    A browser User-Agent is sent because the site rejects the default
    python-requests client string.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36"}
    # timeout keeps the crawler from hanging forever on a stalled connection
    res = requests.get(url, headers=headers, timeout=30)
    # guess the charset from the body; more reliable than the response header here
    res.encoding = res.apparent_encoding
    if res.status_code == 200:
        return res.text
    return None
def get_page(html):
    """Parse the comic index page and return (titles, hrefs) for every chapter.

    The page lists chapters newest-first; both lists are reversed so that
    downloads proceed in reading order (oldest first).
    """
    title_list, href_list = [], []
    soups = BeautifulSoup(html, "lxml")
    soups = soups.find(attrs={"class": "list_con_li autoHeight"})
    for soup in soups.select("li a"):
        # append + reverse is O(n) overall; the original insert(0, ...) was O(n^2)
        title_list.append(soup['title'])
        href_list.append(soup['href'])
    title_list.reverse()
    href_list.reverse()
    return title_list, href_list
def is_number(s):
    """Return True if *s* parses as a float, False otherwise.

    The original fell off the end and implicitly returned None on failure;
    an explicit False is clearer and equally falsy for callers.
    """
    try:
        float(s)
        return True
    except ValueError:
        return False
def main_download(name, url):
    """Download every page image of one chapter into 镖人/<name>/.

    The chapter page hides image URLs inside a packed-JS blob
    (``eval(function(p,a,c,k,e,d)...)``). The numeric tokens in that blob
    encode the CDN path segments:
      * 4-digit token        -> first path segment
      * 5- or 6-digit token  -> second path segment
      * token of 7+ digits   -> an individual image file name
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36",
        "Referer": url}
    res = requests.get(url, headers=headers, timeout=30)
    html = res.text
    # raw string: the original non-raw pattern relied on unrecognised escapes
    # like "\(" passing through, which is a SyntaxWarning on modern Python
    link = re.findall(r"function\(p,a,c,k,e,d\).*?split", html)
    if not link:
        # page layout changed or the request was blocked -- nothing to decode
        # (original crashed here with IndexError on link[0])
        return
    # 解码js,构造图片真实链接 (decode the packed JS, build the real image links)
    first_number = link[0].split("'.split")
    first_number = first_number[0].split("||")
    links, second = [], []
    first = ""
    for token_group in first_number:
        for num in token_group.split("|"):
            if is_number(num) and len(num) == 4:
                first = num
            elif is_number(num) and len(num) in (5, 6):
                try:
                    # float-like tokens such as "12.34" pass is_number but
                    # cannot be int()-ed; skip them like the original did
                    second.append(int(num))
                except ValueError:
                    pass
            elif is_number(num) and len(num) >= 7:
                links.append(num)
    if not second:
        # no second path segment decoded -- original crashed on second[0]
        return
    # NOTE(review): lexicographic sort of digit strings; assumed OK because the
    # observed file-name tokens share a length, making it numeric order too
    links = sorted(links)
    for img_name in links:
        imgs_link = f'https://images.dmzj.com/img/chapterpic/{first}/{second[0]}/{img_name}.jpg'
        response = requests.get(url=imgs_link, headers=headers, timeout=30)
        try:
            with open(f"镖人/{name}/{img_name}.jpg", 'wb') as f:
                f.write(response.content)
        except OSError:
            # best-effort: skip an image whose file cannot be written
            pass
    print(f"{name}: 已经下载完成")
def main():
    """Entry point: crawl the 《镖人》 index page and download every chapter."""
    # makedirs(exist_ok=True) avoids the exists()/mkdir() race of the original
    os.makedirs("镖人", exist_ok=True)
    url = "https://www.dmzj.com/info/biaoren.html"
    html = get_url(url)
    title_list, href_list = get_page(html)
    # chapter_url avoids shadowing the index-page `url` above
    for name, chapter_url in zip(title_list, href_list):
        # one sub-directory per chapter
        os.makedirs(f"镖人/{name}", exist_ok=True)
        main_download(name, chapter_url)


if __name__ == '__main__':
    main()
|