
Python async coroutine crawler with aiohttp: a 2020 crawler for 喵绅士 (NyaHentai)

This has grown into a full-site crawler.


Updated 10.21

Updated 11.13

Updated 12.1
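
The whole script is built around one pattern: a single shared aiohttp.ClientSession, plus an asyncio.Semaphore that caps how many requests are in flight at once. Here is a minimal, self-contained sketch of that pattern before the full listing; the URL is just a placeholder, and the semaphore is created inside the coroutine (rather than at module level, as the full script does) to avoid event-loop binding quirks on newer Python versions:

import asyncio

import aiohttp

async def fetch(url, session, sem):
    # take a semaphore slot, GET the page, return its text on success
    async with sem:
        try:
            async with session.get(url) as resp:
                if resp.status in [200, 201]:
                    return await resp.text()
        except Exception as e:
            print(e)

async def demo():
    sem = asyncio.Semaphore(500)  # same cap the crawler below uses
    async with aiohttp.ClientSession() as session:
        tasks = [fetch('https://example.org/', session, sem) for _ in range(10)]
        pages = await asyncio.gather(*tasks)
        print(sum(p is not None for p in pages), 'pages fetched')

asyncio.run(demo())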


from pyquery import PyQuery

import aiohttp

import re

import asyncio

from urllib import parse

import urllib.request

from lxml import etree

import os

import time


stopping = False


# BASE_DIR = os.getcwd()

BASE_DIR = 'H:\\'  # root folder for all downloads (a Windows drive)


base_url = 'https://zha.qqhentai.com/'


artist_hot_url = 'https://zha.qqhentai.com/artists/hot/page/'


# cap on in-flight requests shared by every coroutine

sem = asyncio.Semaphore(500)


timeout = 15  # per-request timeout in seconds; raised to 60 when a gallery has many missing pages



async def create_dir(path, name):

    tar = os.path.exists(path + '\\' + name)

    if not tar:

        print('new dir !!!!!!!!!', name)

        try:

            os.mkdir(path + '\\' + name)

        except Exception as e:

            print('in creating dir: ', e)



async def fetch(url, session):
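    # GET one page and return its HTML text; non-2xx statuses and errors yield None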

    async with sem:

        try:

            async with session.get(url) as resp:

                # print('url status:{}  {}'.format(url, resp.status))

                if resp.status in [200, 201]:

                    data = await resp.text()

                    return data

        except Exception as e:

            print(e)



async def fetch_file(url, session):
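    # GET one binary file; on a bad status, swap the .png/.jpg extension and retry once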

    global timeout

    async with sem:

        try:

            async with session.get(url, timeout=timeout) as resp:

                # print('url status:{}  {}'.format(url, resp.status))

                if resp.status in [200, 201]:

                    data = await resp.read()

                    return data

                else:

                    # some images are served under the other extension:

                    # swap .png <-> .jpg here, then retry once below

                    tpe = url.split('.')[-1]

                    if tpe == 'png':

                        url = url[:-3] + 'jpg'

                    elif tpe == 'jpg':

                        url = url[:-3] + 'png'


            if resp.status not in [200, 201]:

                async with session.get(url, timeout=timeout) as resp2:

                    # print('url status:{}  {}'.format(url, resp2.status))

                    if resp2.status in [200, 201]:

                        data = await resp2.read()

                        return data


        except Exception as e:

            print(e)



# write down your logic

def extract_urls(html):
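    # collect the per-page viewer links of one gallery from the #thumbnail-container block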

    urls = []

    pq = PyQuery(html)

    pq = pq('#thumbnail-container')


    for link in pq.items('a'):

        url = link.attr('href')

        url = parse.urljoin(base_url, url)

        # print(url)

        urls.append(url)

    return urls



def extract_artists_links(html):
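    # map sanitized artist names to their listing URLs, taken from #tag-container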

    pq = PyQuery(html)

    pq = pq('#tag-container')

    # print(pq)

    urls = {}

    for link in pq.items('a'):

        url = link.attr('href')

        # replace characters that Windows forbids in file names

        name = link.text().split(' (')[0].replace('/', '-').replace('\\', '-').replace('\n', '-').replace('?', '[ask]').replace('|', '[竖杠]').replace('+', '[add]').replace('{', '[left]').replace('}', '[right]').replace(':', '[mao_hao]').replace('"', '[双引]').replace('*', '[star]').replace('>', '[b-than]').replace('<', '[s-than]')

        # print(url)

        urls.update({name: parse.urljoin(base_url, url)})

    print(urls)

    return urls



def extract_comics_links(html):
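    # map gallery URLs to their titles (the alt text of each cover image)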

    pq = PyQuery(html)

    # print(pq)

    pq = pq('.container.index-container') # the class attribute contains a space!


    urls_names = {}

    for link in pq.items('a'):

        url = link.attr('href')

        name = link('img').attr('alt')

        # print(url, name)

        urls_names.update({parse.urljoin(base_url, url): name})

    # print(urls_names)

    return urls_names


def extract_rec_links(html):
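    # pull every /g/<id>/ gallery link out of a listing page with a regex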

    pat = r'/g/\d+/'  # gallery links look like /g/<id>/

    urls = re.findall(pat, html)

    urls_f = []

    for url in urls:

        url = parse.urljoin(base_url, url)

        urls_f.append(url)

    print(len(urls), urls)

    print(len(urls_f), urls_f)

    return urls_f



async def get_jpeg_url(url, session, path):
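    # open one viewer page and download the image inside #image-container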

    html = await fetch(url, session)

    try:

        tree = etree.HTML(html)

        real_url = tree.xpath('//section[@id="image-container"]/a/img/@src')

        await get_jpeg(real_url[0], session, path)

    except Exception as e:

        print(e)



async def get_jpeg(url, session, path):
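    # download a single image, skipping files already on disk (lets a crawl resume)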

    global timeout

    num = url.split('/')[-1]

    path_t = path+'\\'+num

    tar = os.path.exists(path_t)

    # print(url)

    if not tar:

        data = await fetch_file(url, session)

        if data:

            # print(type(data))

            with open(path_t, 'wb') as f:

                print(path_t, timeout)

                f.write(data)

        else:

            print('fail on page ', url, 'timeout', timeout)



async def comic_handler(url, path_o, session):
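    # crawl one gallery: create its folder, dump the #info block to html.txt, then fetch only the missing pages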

    global timeout

    html = await fetch(url, session)


    pq = PyQuery(html)

    pq = pq('#info')

    # print(pq)

    # the title may sit in any <h*> tag, so probe h0-h9 and keep the first hit

    for i in range(10):

        name = pq('h'+str(i)).text()

        if name:

            break


    # the page count is the first number inside a bare <div> in the #info block

    pat = r'<div>.*?(\d+).*?</div>'

    page = re.findall(pat, str(pq))[0]

    # print(page)

    # replace characters that Windows forbids in file names

    name = str(name).replace('/', '-').replace('\\', '-').replace('\n', '-').replace('?', '[ask]').replace('|', '[竖杠]').replace('+', '[add]').replace('{', '[left]').replace('}', '[right]').replace(':', '[mao_hao]').replace('"', '[双引]').replace('*', '[star]').replace('>', '[b-than]').replace('<', '[s-than]')


    if name == '':

        name = 'None'


    print(path_o.split('\\')[-1])

    # normalize serial numbering like '1-23' / '1~23' in the title to '1-x' / '1~x'

    name = re.compile(r'1-\d+').sub('1-x', name)

    name = re.compile(r'1~\d+').sub('1~x', name)


    # create the target directory

    await create_dir(path_o, name)

    path = path_o+'\\'+name


    print(name, url, path)

    '''html-detail'''

    with open(path + '\\html.txt', 'w', encoding='utf-8') as f:

        f.write(str(pq))


    rec = os.listdir(path)

    try:

        del rec[rec.index('html.txt')]

    except Exception as e:

        print(e)


    # keep only the numeric stem of each downloaded file name

    rec = [item.split('.')[0] for item in rec]


    print(sorted(list(map(int, rec))), int(page), len(rec), int(page)-len(rec))

    # many pages still missing -> allow a longer per-request timeout

    if int(page)-len(rec) >= 50:

        timeout = 60

        print('timeout', timeout)

    else:

        timeout = 15

        print('timeout', timeout)


    if str(len(rec)) == page:

        return

    else:

        urls_t = extract_urls(html)


        tasks1 = []


        for item in urls_t:

            p = item.split('/')[-2]

            # print(p, item)

            if p in rec:

                # print('in')

                continue

            tasks1.append(get_jpeg_url(item, session, path))


        await asyncio.gather(*tasks1)



async def comic_handler_r(url, root_path, session):
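    # like comic_handler, but first files the gallery under a per-artist folder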

    global timeout

    html = await fetch(url, session)

    # the artist name is the text of the /artist/... link

    pat = r'/artist/.*?>(.*?) <'

    arti = re.findall(pat, html)

    try:

        # replace characters that Windows forbids in file names

        arti = arti[0].replace('/', '-').replace('\\', '-').replace('\n', '-').replace('?', '[ask]').replace('|', '[竖杠]').replace('+', '[add]').replace('{', '[left]').replace('}', '[right]').replace(':', '[mao_hao]').replace('"', '[双引]').replace('*', '[star]').replace('>', '[b-than]').replace('<', '[s-than]')

    except Exception as e:

        print('in comic handler r:', e)

        arti = 'no_name'


    # skip 'con': it is a reserved device name on Windows and cannot be a directory

    if arti and arti != 'con':

        await create_dir(root_path, arti)

        path = root_path+'\\'+arti

    else:

        await create_dir(root_path, 'no_name')

        path = root_path+'\\no_name'


    await comic_handler(url, path, session)




async def page_handler(comics_urls, arti_path, session):
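    # crawl every gallery from one listing page, one after another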

    for comic_link in comics_urls:  # crawl each of this artist's works in turn

        print(time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime(time.time())))


        # try:

        await comic_handler(comic_link, arti_path, session)

        # except Exception as e:

        #     print(e)



async def unique_comic_names(arti_base_link, session):
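    # walk an artist's Chinese-language listing pages and collect all titles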

    i = 0


    urls_names = {}

    # keep paging until a fetch comes back empty and extraction raises

    while True:

        try:

            html = await fetch(arti_base_link+'chinese/page/'+str(i), session)

            urls_t_names_t = extract_comics_links(html)

            urls_names.update(urls_t_names_t)


            i += 1

        except Exception as e:

            print('in unique', e)

            break


    # return the number of distinct titles (ignoring case and spaces) and the url->title map

    return len(list(set([s.lower().replace(' ', '') for s in urls_names.values()]))), urls_names



async def check_arti(num, arti_path, urls_names, session):
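    # compare what is on disk with the site's counts; re-crawl an incomplete gallery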

    rec = os.listdir(arti_path)

    print('need:', num, 'now comics:', len(rec))


    if len(rec) == 0:

        return False


    if len(rec) >= num:

        for name in rec:

            comic_path = arti_path+'\\'+name

            recc = os.listdir(comic_path)

            with open(comic_path+'\\html.txt', 'r', encoding='utf8') as f:

                t = f.read()

                pat = '共 (.*?) 頁'

                patt = r'<h\d>(.*?)</h\d>'

                try:

                    k = int(re.findall(pattern=pat, string=t)[0])

                except Exception as e:

                    print('in', name, e)

                    k = int(re.findall(pattern='共(.*?)頁', string=t)[0])

                kk = re.findall(pattern=patt, string=t)

                print('need:', k, 'now:', len(recc)-1, name)  # -1 for the html.txt entry

                found = 0

                if len(recc)-1 < k:

                    for n in kk:

                        if found:

                            break

                        for u, nn in urls_names.items():

                            if found:

                                break

                            if n == nn:

                                print(time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime(time.time())))

                                await comic_handler(u, arti_path, session)

                                found += 1

                    print(found)

                    if not found:

                        return False

    else:

        return False


    return True



async def main():

    async with aiohttp.ClientSession() as session:

        '''

                                  choose a crawl mode

                                      ||

                                      \/

        '''

        '''get-all-site's-work'''

        # lock = Lock()

        # for i in range(331682, 1, -1):

        #     await lock.acquire()

        #     await comic_handler('https://nyahentai.club/g/'+str(i)+'/', session)

        #     lock.release()


        '''only-one'''

        # await comic_handler('https://zh.qqhentai.com/g/333292/', BASE_DIR,  session)


        '''get specific artist'''

        # arti_name = 'ratatatat74'

        # arti_base_link = 'https://zh.yyhentai.com/artist/ratatatat74-mr-skull/chinese/'

        # await create_dir(BASE_DIR, 'all_co')

        # root_path = BASE_DIR + '\\all_co'

        # await create_dir(root_path, arti_name)

        # arti_path = root_path + '\\' + arti_name

        # i = 1

        # while (True):  # choose one of his page

        #     try:

        #         html_all_comics = await fetch(arti_base_link + 'page/' + str(i), session)

        #         print(arti_base_link + 'page/' + str(i))

        #         comics_urls = extract_comics_links(html_all_comics)

        #

        #         await page_handler(comics_urls, arti_path, session)

        #         # for comic_link in comics_urls: # choose one of his work

        #         #     await comic_handler(comic_link, arti_path, session)

        #         i += 1

        #

        #

        #     except Exception as e:

        #         print('fail or end on artist', arti_name, e)

        #         break



        '''get recent Chinese comics'''

        await create_dir(BASE_DIR, 'all_co')

        root_path = BASE_DIR + '\\all_co'


        for j in range(1, 1000):

            print('pg:', j)

            rec_html = await fetch(base_url+'language/chinese/page/'+str(j), session)


            comic_urls = extract_rec_links(rec_html)


            for url in comic_urls:

                await comic_handler_r(url, root_path, session)





        # '''get-all-site's-hot-artists-work'''

        # await create_dir(BASE_DIR, 'all_co')

        # root_path = BASE_DIR+'\\all_co'

        #

        # for j in range(15, 180): # artists page

        #     print(j)

        #     html_all_artists = await fetch(artist_hot_url + str(j), session)

        #

        #     artists_base_urls = extract_artists_links(html_all_artists) # one page

        #

        #     for arti_name, arti_base_link in artists_base_urls.items(): # choose one artist

        #

        #         await create_dir(root_path, arti_name)

        #         arti_path = root_path+'\\'+arti_name

        #

        #         # try:

        #

        #         k, comics_urls_names = await unique_comic_names(arti_base_link, session)

        #         print(arti_name, 'artist works', k)

        #

        #         if await check_arti(int(k), arti_path, comics_urls_names, session):

        #             print('next artist===============================================================')

        #             continue

        #

        #         await page_handler(comics_urls_names.keys(), arti_path, session)

        #         print('next artist===============================================================')

        #         # except Exception as e:

        #         #     print('fail or end on artist', arti_name, e)

        #         #     break



if __name__ == "__main__":

    # print(BASE_DIR)

    loop = asyncio.get_event_loop()

    loop.run_until_complete(main())

    # tha_main()
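

A side note on the entry point: the asyncio.get_event_loop() / run_until_complete pair still works, but on Python 3.7+ the same thing is usually written with asyncio.run(), which creates and closes the loop for you:

if __name__ == "__main__":
    asyncio.run(main())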



# =========================================================

Original post by CSDN blogger 是脑瘫啊, licensed CC 4.0 BY-SA: https://blog.csdn.net/lafea/article/details/108957383