Python coroutine async aiohttp crawler: 喵绅士 2020 latest crawler
This is now a full-site crawler.
Updated 10.21
Updated 11.13
Updated 12.1
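The full script is below. It crawls with aiohttp coroutines driven by asyncio, throttles concurrency with a semaphore of 500, writes everything under BASE_DIR (H:\ here), and main() contains several crawl modes that are switched by commenting/uncommenting: the whole site by gallery id, a single comic, one artist, the recent Chinese-language listing (the mode enabled below), and the hot-artists listing.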
from pyquery import PyQuery
import aiohttp
import re
import asyncio
from urllib import parse
import urllib.request
from lxml import etree
import os
stopping = False
# BASE_DIR = os.getcwd()
BASE_DIR = 'H:\\'
base_url = 'https://zha.qqhentai.com/'
artist_hot_url = 'https://zha.qqhentai.com/artists/hot/page/'
sem = asyncio.Semaphore(500)
timeout = 15
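# Create the subdirectory <name> under <path> if it does not exist yet.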
async def create_dir(path, name):
    tar = os.path.exists(path + '\\' + name)
    if not tar:
        print('new dir !!!!!!!!!', name)
        try:
            os.mkdir(path + '\\' + name)
        except Exception as e:
            print('in creating dir: ', e)
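# Fetch a page and return its HTML text; concurrency is limited by the global semaphore.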
async def fetch(url, session):
    async with sem:
        try:
            async with session.get(url) as resp:
                # print('url status:{} {}'.format(url, resp.status))
                if resp.status in [200, 201]:
                    data = await resp.text()
                    return data
        except Exception as e:
            print(e)
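# Download binary image data; if the first request does not return 200/201, retry once with the .png/.jpg extension swapped.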
async def fetch_file(url, session):
    global timeout
    async with sem:
        try:
            async with session.get(url, timeout=timeout) as resp:
                # print('url status:{} {}'.format(url, resp.status))
                if resp.status in [200, 201]:
                    data = await resp.read()
                    return data
                else:
                    # swap the file extension and retry once
                    tpe = url.split('.')[-1]
                    # print(tpe)
                    if tpe == 'png':
                        url = url.replace('png', 'jpg')
                    elif tpe == 'jpg':
                        url = url.replace('jpg', 'png')
                    # print(url)
                    async with session.get(url, timeout=timeout) as resp2:
                        # print('url status:{} {}'.format(url, resp2.status))
                        if resp2.status in [200, 201]:
                            data = await resp2.read()
                            return data
        except Exception as e:
            print(e)
# write down your logic
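# Extract the per-page viewer links from a gallery's thumbnail container.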
def extract_urls(html):
    urls = []
    pq = PyQuery(html)
    pq = pq('#thumbnail-container')
    for link in pq.items('a'):
        url = link.attr('href')
        url = base_url + url
        # print(url)
        urls.append(url)
    return urls
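# Build a {artist name: artist URL} mapping from an artist-list page, sanitising names for Windows folder names.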
def extract_artists_links(html):
    pq = PyQuery(html)
    pq = pq('#tag-container')
    # print(pq)
    urls = {}
    for link in pq.items('a'):
        url = link.attr('href')
        name = link.text().split(' (')[0].replace('/', '-').replace('\\', '-').replace('\n', '-').replace('?', '[ask]').replace('|', '[竖杠]').replace('+', '[add]').replace('{', '[left]').replace('}', '[right]').replace(':', '[mao_hao]').replace('"', '[双引]').replace('*', '[star]').replace('>', '[b-than]').replace('<', '[s-than]')
        # print(url)
        urls.update({name: parse.urljoin(base_url, url)})
    print(urls)
    return urls
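# Build a {comic URL: comic title} mapping from a listing page.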
def extract_comics_links(html):
    pq = PyQuery(html)
    # print(pq)
    pq = pq('.container.index-container')  # the class attribute contains a space!
    urls_names = {}
    for link in pq.items('a'):
        url = link.attr('href')
        name = link('img').attr('alt')
        # print(url, name)
        urls_names.update({parse.urljoin(base_url, url): name})
    # print(urls_names)
    return urls_names
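# Pull /g/<id>/ gallery links out of a "recent" listing page with a regex.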
def extract_rec_links(html):
    pat = r'/g/\d+/'
    urls = re.findall(pat, html)
    urls_f = []
    for url in urls:
        url = parse.urljoin(base_url, url)
        urls_f.append(url)
    print(len(urls), urls)
    print(len(urls_f), urls_f)
    return urls_f
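# Open one viewer page and download the full-size image it points to.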
async def get_jpeg_url(url, session, path):
    html = await fetch(url, session)
    try:
        tree = etree.HTML(html)
        real_url = tree.xpath('//section[@id="image-container"]/a/img/@src')
        await get_jpeg(real_url[0], session, path)
    except Exception as e:
        print(e)
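# Save a single image into <path>, skipping files that already exist on disk.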
async def get_jpeg(url, session, path):
    global timeout
    num = url.split('/')[-1]
    path_t = path + '\\' + num
    tar = os.path.exists(path_t)
    # print(url)
    if not tar:
        data = await fetch_file(url, session)
        if data:
            # print(type(data))
            with open(path_t, 'wb') as f:
                print(path_t, timeout)
                f.write(data)
        else:
            print('fail on page ', url, 'timeout', timeout)
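# Download one comic: read its title and page count from the #info block, create its folder, dump the block to html.txt, then fetch only the pages that are still missing.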
async def comic_handler(url, path_o, session):
    global timeout
    html = await fetch(url, session)
    pq = PyQuery(html)
    pq = pq('#info')
    # print(pq)
    for i in range(10):
        name = pq('h' + str(i)).text()
        if name:
            break
    pat = r'<div>.*?(\d+).*?</div>'
    page = re.findall(pat, str(pq))[0]
    # print(page)
    name = str(name).replace('/', '-').replace('\\', '-').replace('\n', '-').replace('?', '[ask]').replace('|', '[竖杠]').replace('+', '[add]').replace('{', '[left]').replace('}', '[right]').replace(':', '[mao_hao]').replace('"', '[双引]').replace('*', '[star]').replace('>', '[b-than]').replace('<', '[s-than]')
    if name == '':
        name = 'None'
    print(path_o.split('\\')[-1])
    name = re.compile(r'1-\d+').sub('1-x', name)
    name = re.compile(r'1~\d+').sub('1~x', name)
    # create the target folder
    await create_dir(path_o, name)
    path = path_o + '\\' + name
    print(name, url, path)
    '''html-detail'''
    with open(path + '\\html.txt', 'w', encoding='utf-8') as f:
        f.write(str(pq))
    rec = os.listdir(path)
    try:
        del rec[rec.index('html.txt')]
    except Exception as e:
        print(e)
    for item in rec:
        rec[rec.index(item)] = item.split('.')[0]
    print(sorted(list(map(int, rec))), int(page), len(rec), int(page) - len(rec))
    if int(page) - len(rec) >= 50:
        timeout = 60
        print('timeout', timeout)
    else:
        timeout = 15
        print('timeout', timeout)
    if str(len(rec)) == page:
        return
    else:
        urls_t = extract_urls(html)
        tasks1 = []
        for item in urls_t:
            p = item.split('/')[-2]
            # print(p, item)
            if p in rec:
                # print('in')
                continue
            tasks1.append(get_jpeg_url(item, session, path))
        await asyncio.gather(*tasks1)
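# Wrapper used by the "recent comics" mode: resolve the artist name first, then run comic_handler inside that artist's folder.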
async def comic_handler_r(url, root_path, session):
    global timeout
    html = await fetch(url, session)
    pat = '/artist/.*?>(.*?) <'
    arti = re.findall(pat, html)
    try:
        arti = arti[0].replace('/', '-').replace('\\', '-').replace('\n', '-').replace('?', '[ask]').replace('|', '[竖杠]').replace('+', '[add]').replace('{', '[left]').replace('}', '[right]').replace(':', '[mao_hao]').replace('"', '[双引]').replace('*', '[star]').replace('>', '[b-than]').replace('<', '[s-than]')
    except Exception as e:
        print('in comic handler r:', e)
        arti = 'no_name'
    if arti and arti != 'con':
        await create_dir(root_path, arti)
        path = root_path + '\\' + arti
    else:
        await create_dir(root_path, 'no_name')
        path = root_path + '\\no_name'
    await comic_handler(url, path, session)
import time
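# Download every comic in a list of comic URLs, one after another.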
async def page_handler(comics_urls, arti_path, session):
    for comic_link in comics_urls:  # choose one of his works
        print(time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime(time.time())))
        # try:
        await comic_handler(comic_link, arti_path, session)
        # except Exception as e:
        #     print(e)
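# Walk an artist's Chinese-language listing pages until a fetch fails; return the number of unique titles and the full {URL: title} mapping.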
async def unique_comic_names(arti_base_link, session):
    i = 0
    urls_names = {}
    while True:
        try:
            html = await fetch(arti_base_link + 'chinese/page/' + str(i), session)
            urls_t_names_t = extract_comics_links(html)
            urls_names.update(urls_t_names_t)
            i += 1
        except Exception as e:
            print('in unique', e)
            break
    return len(list(set([s.lower().replace(' ', '') for s in urls_names.values()]))), urls_names
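# Compare what is already on disk with the expected comic/page counts and re-download incomplete comics; return True only when the artist looks complete.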
async def check_arti(num, arti_path, urls_names, session):
    rec = os.listdir(arti_path)
    print('need:', num, 'now comics:', len(rec))
    if len(rec) == 0:
        return False
    if len(rec) >= num:
        for name in rec:
            comic_path = arti_path + '\\' + name
            recc = os.listdir(comic_path)
            with open(comic_path + '\\html.txt', 'r', encoding='utf8') as f:
                t = f.read()
            pat = '共 (.*?) 頁'
            patt = r'<h\d>(.*?)</h\d>'
            try:
                k = int(re.findall(pattern=pat, string=t)[0])
            except Exception as e:
                print('in', name, e)
                k = int(re.findall(pattern='共(.*?)頁', string=t)[0])
            kk = re.findall(pattern=patt, string=t)
            print('need:', k, 'now:', len(recc) - 1, name)
            f = 0
            if len(recc) - 1 < k:
                for n in kk:
                    if f:
                        break
                    for u, nn in urls_names.items():
                        if f:
                            break
                        if n == nn:
                            print(time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime(time.time())))
                            await comic_handler(u, arti_path, session)
                            f += 1
                print(f)
                if not f:
                    return False
    else:
        return False
    return True
async def main(start):
    async with aiohttp.ClientSession() as session:
        '''choose the crawl mode: uncomment one of the blocks below'''
        '''get-all-site`s-work'''
        # lock = Lock()
        # for i in range(331682, 1, -1):
        #     await lock.acquire()
        #     await comic_handler('https://nyahentai.club/g/' + str(i) + '/', session)
        #     lock.release()
        '''only-one'''
        # await comic_handler('https://zh.qqhentai.com/g/333292/', BASE_DIR, session)
        '''get specific artist'''
        # arti_name = 'ratatatat74'
        # arti_base_link = 'https://zh.yyhentai.com/artist/ratatatat74-mr-skull/chinese/'
        # await create_dir(BASE_DIR, 'all_co')
        # root_path = BASE_DIR + '\\all_co'
        # await create_dir(root_path, arti_name)
        # arti_path = root_path + '\\' + arti_name
        # i = 1
        # while (True):  # choose one of his pages
        #     try:
        #         html_all_comics = await fetch(arti_base_link + 'page/' + str(i), session)
        #         print(arti_base_link + 'page/' + str(i))
        #         comics_urls = extract_comics_links(html_all_comics)
        #
        #         await page_handler(comics_urls, arti_path, session)
        #         # for comic_link in comics_urls:  # choose one of his works
        #         #     await comic_handler(comic_link, arti_path, session)
        #         i += 1
        #
        #
        #     except Exception as e:
        #         print('fail or end on artist', arti_name, e)
        #         break
        '''get recent ch comics'''
        await create_dir(BASE_DIR, 'all_co')
        root_path = BASE_DIR + '\\all_co'
        for j in range(1, 1000):
            print('pg:', j)
            rec_html = await fetch(base_url + 'language/chinese/page/' + str(j), session)
            comic_urls = extract_rec_links(rec_html)
            for url in comic_urls:
                await comic_handler_r(url, root_path, session)
        # '''get-all-site`s-hot-artists-work'''
        # await create_dir(BASE_DIR, 'all_co')
        # root_path = BASE_DIR + '\\all_co'
        #
        # for j in range(15, 180):  # artists page
        #     print(j)
        #     html_all_artists = await fetch(artist_hot_url + str(j), session)
        #
        #     artists_base_urls = extract_artists_links(html_all_artists)  # one page
        #
        #     for arti_name, arti_base_link in artists_base_urls.items():  # choose one artist
        #
        #         await create_dir(root_path, arti_name)
        #         arti_path = root_path + '\\' + arti_name
        #
        #         # try:
        #
        #         k, comics_urls_names = await unique_comic_names(arti_base_link, session)
        #         print(arti_name, 'artist works', k)
        #
        #         if await check_arti(int(k), arti_path, comics_urls_names, session):
        #             print('next artist===============================================================')
        #             continue
        #
        #         await page_handler(comics_urls_names.keys(), arti_path, session)
        #         print('next artist===============================================================')
        #         # except Exception as e:
        #         #     print('fail or end on artist', arti_name, e)
        #         #     break
if __name__ == "__main__":
    # print(BASE_DIR)
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main(loop))
# tha_main()
# =========================================================
Copyright notice: this is an original article by CSDN blogger 「是脑瘫啊」, released under the CC 4.0 BY-SA license; please include the original source link and this notice when reposting.
Original link: https://blog.csdn.net/lafea/article/details/108957383