
Python async coroutine crawler with aiohttp: a 2020 crawler for 喵绅士 (NyaHentai)

This has grown into a full-site crawler.


Updated 10.21

Updated 11.13

Updated 12.1
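
The whole script is built around one pattern: a single shared aiohttp.ClientSession, plus an asyncio.Semaphore that caps how many requests are in flight at once. Here is a minimal, self-contained sketch of that pattern before the full listing; the URL is just a placeholder, and the semaphore is created inside the coroutine (rather than at module level, as the full script does) to avoid event-loop binding quirks on newer Python versions:

import asyncio

import aiohttp

async def fetch(url, session, sem):
    # take a semaphore slot, GET the page, return its text on success
    async with sem:
        try:
            async with session.get(url) as resp:
                if resp.status in [200, 201]:
                    return await resp.text()
        except Exception as e:
            print(e)

async def demo():
    sem = asyncio.Semaphore(500)  # same cap the crawler below uses
    async with aiohttp.ClientSession() as session:
        tasks = [fetch('https://example.org/', session, sem) for _ in range(10)]
        pages = await asyncio.gather(*tasks)
        print(sum(p is not None for p in pages), 'pages fetched')

asyncio.run(demo())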


from pyquery import PyQuery

import aiohttp

import re

import asyncio

from urllib import parse

import urllib.request

from lxml import etree

import os

import time


stopping = False


# BASE_DIR = os.getcwd()

BASE_DIR = 'H:\\'  # root folder for all downloads (a Windows drive)


base_url = 'https://zha.qqhentai.com/'


artist_hot_url = 'https://zha.qqhentai.com/artists/hot/page/'


# cap on in-flight requests shared by every coroutine

sem = asyncio.Semaphore(500)


timeout = 15  # per-request timeout in seconds; raised to 60 when a gallery has many missing pages



async def create_dir(path, name):

    tar = os.path.exists(path + '\\' + name)

    if not tar:

        print('new dir !!!!!!!!!', name)

        try:

            os.mkdir(path + '\\' + name)

        except Exception as e:

            print('in creating dir: ', e)



async def fetch(url, session):
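    # GET one page and return its HTML text; non-2xx statuses and errors yield None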

    async with sem:

        try:

            async with session.get(url) as resp:

                # print('url status:{}  {}'.format(url, resp.status))

                if resp.status in [200, 201]:

                    data = await resp.text()

                    return data

        except Exception as e:

            print(e)



async def fetch_file(url, session):
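    # GET one binary file; on a bad status, swap the .png/.jpg extension and retry once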

    global timeout

    async with sem:

        try:

            async with session.get(url, timeout=timeout) as resp:

                # print('url status:{}  {}'.format(url, resp.status))

                if resp.status in [200, 201]:

                    data = await resp.read()

                    return data

                else:

                    # some images are served under the other extension:

                    # swap .png <-> .jpg here, then retry once below

                    tpe = url.split('.')[-1]

                    if tpe == 'png':

                        url = url[:-3] + 'jpg'

                    elif tpe == 'jpg':

                        url = url[:-3] + 'png'


            if resp.status not in [200, 201]:

                async with session.get(url, timeout=timeout) as resp2:

                    # print('url status:{}  {}'.format(url, resp2.status))

                    if resp2.status in [200, 201]:

                        data = await resp2.read()

                        return data


        except Exception as e:

            print(e)



# write down your logic

def extract_urls(html):
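    # collect the per-page viewer links of one gallery from the #thumbnail-container block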

    urls = []

    pq = PyQuery(html)

    pq = pq('#thumbnail-container')


    for link in pq.items('a'):

        url = link.attr('href')

        url = parse.urljoin(base_url, url)

        # print(url)

        urls.append(url)

    return urls



def extract_artists_links(html):
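    # map sanitized artist names to their listing URLs, taken from #tag-container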

    pq = PyQuery(html)

    pq = pq('#tag-container')

    # print(pq)

    urls = {}

    for link in pq.items('a'):

        url = link.attr('href')

        # replace characters that Windows forbids in file names

        name = link.text().split(' (')[0].replace('/', '-').replace('\\', '-').replace('\n', '-').replace('?', '[ask]').replace('|', '[竖杠]').replace('+', '[add]').replace('{', '[left]').replace('}', '[right]').replace(':', '[mao_hao]').replace('"', '[双引]').replace('*', '[star]').replace('>', '[b-than]').replace('<', '[s-than]')

        # print(url)

        urls.update({name: parse.urljoin(base_url, url)})

    print(urls)

    return urls



def extract_comics_links(html):
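    # map gallery URLs to their titles (the alt text of each cover image)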

    pq = PyQuery(html)

    # print(pq)

    pq = pq('.container.index-container') # the class attribute contains a space!


    urls_names = {}

    for link in pq.items('a'):

        url = link.attr('href')

        name = link('img').attr('alt')

        # print(url, name)

        urls_names.update({parse.urljoin(base_url, url): name})

    # print(urls_names)

    return urls_names


def extract_rec_links(html):
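    # pull every /g/<id>/ gallery link out of a listing page with a regex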

    pat = r'/g/\d+/'  # gallery links look like /g/<id>/

    urls = re.findall(pat, html)

    urls_f = []

    for url in urls:

        url = parse.urljoin(base_url, url)

        urls_f.append(url)

    print(len(urls), urls)

    print(len(urls_f), urls_f)

    return urls_f



async def get_jpeg_url(url, session, path):
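    # open one viewer page and download the image inside #image-container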

    html = await fetch(url, session)

    try:

        tree = etree.HTML(html)

        real_url = tree.xpath('//section[@id="image-container"]/a/img/@src')

        await get_jpeg(real_url[0], session, path)

    except Exception as e:

        print(e)



async def get_jpeg(url, session, path):
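    # download a single image, skipping files already on disk (lets a crawl resume)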

    global timeout

    num = url.split('/')[-1]

    path_t = path+'\\'+num

    tar = os.path.exists(path_t)

    # print(url)

    if not tar:

        data = await fetch_file(url, session)

        if data:

            # print(type(data))

            with open(path_t, 'wb') as f:

                print(path_t, timeout)

                f.write(data)

        else:

            print('fail on page ', url, 'timeout', timeout)



async def comic_handler(url, path_o, session):
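    # crawl one gallery: create its folder, dump the #info block to html.txt, then fetch only the missing pages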

    global timeout

    html = await fetch(url, session)


    pq = PyQuery(html)

    pq = pq('#info')

    # print(pq)

    # the title may sit in any <h*> tag, so probe h0-h9 and keep the first hit

    for i in range(10):

        name = pq('h'+str(i)).text()

        if name:

            break


    # the page count is the first number inside a bare <div> in the #info block

    pat = r'<div>.*?(\d+).*?</div>'

    page = re.findall(pat, str(pq))[0]

    # print(page)

    # replace characters that Windows forbids in file names

    name = str(name).replace('/', '-').replace('\\', '-').replace('\n', '-').replace('?', '[ask]').replace('|', '[竖杠]').replace('+', '[add]').replace('{', '[left]').replace('}', '[right]').replace(':', '[mao_hao]').replace('"', '[双引]').replace('*', '[star]').replace('>', '[b-than]').replace('<', '[s-than]')


    if name == '':

        name = 'None'


    print(path_o.split('\\')[-1])

    # normalize serial numbering like '1-23' / '1~23' in the title to '1-x' / '1~x'

    name = re.compile(r'1-\d+').sub('1-x', name)

    name = re.compile(r'1~\d+').sub('1~x', name)


    # create the target directory

    await create_dir(path_o, name)

    path = path_o+'\\'+name


    print(name, url, path)

    '''html-detail'''

    with open(path + '\\html.txt', 'w', encoding='utf-8') as f:

        f.write(str(pq))


    rec = os.listdir(path)

    try:

        del rec[rec.index('html.txt')]

    except Exception as e:

        print(e)


    # keep only the numeric stem of each downloaded file name

    rec = [item.split('.')[0] for item in rec]


    print(sorted(list(map(int, rec))), int(page), len(rec), int(page)-len(rec))

    # many pages still missing -> allow a longer per-request timeout

    if int(page)-len(rec) >= 50:

        timeout = 60

        print('timeout', timeout)

    else:

        timeout = 15

        print('timeout', timeout)


    if str(len(rec)) == page:

        return

    else:

        urls_t = extract_urls(html)


        tasks1 = []


        for item in urls_t:

            p = item.split('/')[-2]

            # print(p, item)

            if p in rec:

                # print('in')

                continue

            tasks1.append(get_jpeg_url(item, session, path))


        await asyncio.gather(*tasks1)



async def comic_handler_r(url, root_path, session):
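    # like comic_handler, but first files the gallery under a per-artist folder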

    global timeout

    html = await fetch(url, session)

    # the artist name is the text of the /artist/... link

    pat = r'/artist/.*?>(.*?) <'

    arti = re.findall(pat, html)

    try:

        # replace characters that Windows forbids in file names

        arti = arti[0].replace('/', '-').replace('\\', '-').replace('\n', '-').replace('?', '[ask]').replace('|', '[竖杠]').replace('+', '[add]').replace('{', '[left]').replace('}', '[right]').replace(':', '[mao_hao]').replace('"', '[双引]').replace('*', '[star]').replace('>', '[b-than]').replace('<', '[s-than]')

    except Exception as e:

        print('in comic handler r:', e)

        arti = 'no_name'


    # skip 'con': it is a reserved device name on Windows and cannot be a directory

    if arti and arti != 'con':

        await create_dir(root_path, arti)

        path = root_path+'\\'+arti

    else:

        await create_dir(root_path, 'no_name')

        path = root_path+'\\no_name'


    await comic_handler(url, path, session)




async def page_handler(comics_urls, arti_path, session):
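    # crawl every gallery from one listing page, one after another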

    for comic_link in comics_urls:  # crawl each of this artist's works in turn

        print(time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime(time.time())))


        # try:

        await comic_handler(comic_link, arti_path, session)

        # except Exception as e:

        #     print(e)



async def unique_comic_names(arti_base_link, session):
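    # walk an artist's Chinese-language listing pages and collect all titles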

    i = 0


    urls_names = {}

    # keep paging until a fetch comes back empty and extraction raises

    while True:

        try:

            html = await fetch(arti_base_link+'chinese/page/'+str(i), session)

            urls_t_names_t = extract_comics_links(html)

            urls_names.update(urls_t_names_t)


            i += 1

        except Exception as e:

            print('in unique', e)

            break


    # return the number of distinct titles (ignoring case and spaces) and the url->title map

    return len(list(set([s.lower().replace(' ', '') for s in urls_names.values()]))), urls_names



async def check_arti(num, arti_path, urls_names, session):
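    # compare what is on disk with the site's counts; re-crawl an incomplete gallery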

    rec = os.listdir(arti_path)

    print('need:', num, 'now comics:', len(rec))


    if len(rec) == 0:

        return False


    if len(rec) >= num:

        for name in rec:

            comic_path = arti_path+'\\'+name

            recc = os.listdir(comic_path)

            with open(comic_path+'\\html.txt', 'r', encoding='utf8') as f:

                t = f.read()

                pat = '共 (.*?) 頁'

                patt = r'<h\d>(.*?)</h\d>'

                try:

                    k = int(re.findall(pattern=pat, string=t)[0])

                except Exception as e:

                    print('in', name, e)

                    k = int(re.findall(pattern='共(.*?)頁', string=t)[0])

                kk = re.findall(pattern=patt, string=t)

                print('need:', k, 'now:', len(recc)-1, name)  # -1 for the html.txt entry

                found = 0

                if len(recc)-1 < k:

                    for n in kk:

                        if found:

                            break

                        for u, nn in urls_names.items():

                            if found:

                                break

                            if n == nn:

                                print(time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime(time.time())))

                                await comic_handler(u, arti_path, session)

                                found += 1

                    print(found)

                    if not found:

                        return False

    else:

        return False


    return True



async def main():

    async with aiohttp.ClientSession() as session:

        '''

                                  choose a crawl mode

                                      ||

                                      \/

        '''

        '''get-all-site's-work'''

        # lock = Lock()

        # for i in range(331682, 1, -1):

        #     await lock.acquire()

        #     await comic_handler('https://nyahentai.club/g/'+str(i)+'/', session)

        #     lock.release()


        '''only-one'''

        # await comic_handler('https://zh.qqhentai.com/g/333292/', BASE_DIR,  session)


        '''get specific artist'''

        # arti_name = 'ratatatat74'

        # arti_base_link = 'https://zh.yyhentai.com/artist/ratatatat74-mr-skull/chinese/'

        # await create_dir(BASE_DIR, 'all_co')

        # root_path = BASE_DIR + '\\all_co'

        # await create_dir(root_path, arti_name)

        # arti_path = root_path + '\\' + arti_name

        # i = 1

        # while (True):  # choose one of his page

        #     try:

        #         html_all_comics = await fetch(arti_base_link + 'page/' + str(i), session)

        #         print(arti_base_link + 'page/' + str(i))

        #         comics_urls = extract_comics_links(html_all_comics)

        #

        #         await page_handler(comics_urls, arti_path, session)

        #         # for comic_link in comics_urls: # choose one of his work

        #         #     await comic_handler(comic_link, arti_path, session)

        #         i += 1

        #

        #

        #     except Exception as e:

        #         print('fail or end on artist', arti_name, e)

        #         break



        '''get recent Chinese comics'''

        await create_dir(BASE_DIR, 'all_co')

        root_path = BASE_DIR + '\\all_co'


        for j in range(1, 1000):

            print('pg:', j)

            rec_html = await fetch(base_url+'language/chinese/page/'+str(j), session)


            comic_urls = extract_rec_links(rec_html)


            for url in comic_urls:

                await comic_handler_r(url, root_path, session)





        # '''get-all-site's-hot-artists-work'''

        # await create_dir(BASE_DIR, 'all_co')

        # root_path = BASE_DIR+'\\all_co'

        #

        # for j in range(15, 180): # artists page

        #     print(j)

        #     html_all_artists = await fetch(artist_hot_url + str(j), session)

        #

        #     artists_base_urls = extract_artists_links(html_all_artists) # one page

        #

        #     for arti_name, arti_base_link in artists_base_urls.items(): # choose one artist

        #

        #         await create_dir(root_path, arti_name)

        #         arti_path = root_path+'\\'+arti_name

        #

        #         # try:

        #

        #         k, comics_urls_names = await unique_comic_names(arti_base_link, session)

        #         print(arti_name, 'artist works', k)

        #

        #         if await check_arti(int(k), arti_path, comics_urls_names, session):

        #             print('next artist===============================================================')

        #             continue

        #

        #         await page_handler(comics_urls_names.keys(), arti_path, session)

        #         print('next artist===============================================================')

        #         # except Exception as e:

        #         #     print('fail or end on artist', arti_name, e)

        #         #     break



if __name__ == "__main__":

    # print(BASE_DIR)

    loop = asyncio.get_event_loop()

    loop.run_until_complete(main())

    # tha_main()
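

A side note on the entry point: the asyncio.get_event_loop() / run_until_complete pair still works, but on Python 3.7+ the same thing is usually written with asyncio.run(), which creates and closes the loop for you:

if __name__ == "__main__":
    asyncio.run(main())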



# =========================================================

Original post by CSDN blogger 是脑瘫啊, licensed CC 4.0 BY-SA: https://blog.csdn.net/lafea/article/details/108957383