
Scraping Website Data with Selenium

The script below drives Chrome with Selenium to crawl the movie listing pages at dynamic2.scrape.center, follow each movie's detail link, parse the fields of interest, and save every movie as a JSON file.

Launching the Chrome Browser

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import logging
from urllib.parse import urljoin


logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')

INDEX_URL = 'https://dynamic2.scrape.center/page/{page}'
TIME_OUT = 20
TOTAL_PAGE = 10
browser = webdriver.Chrome()
# To run without a visible browser window, use headless mode instead:
# options = webdriver.ChromeOptions()
# options.add_argument('--headless')
# browser = webdriver.Chrome(options=options)
# One shared explicit wait (up to TIME_OUT seconds) reused for every page.
wait = WebDriverWait(browser, TIME_OUT)


def scrape_page(url, condition, locator):
    # Load a page and block until the given expected condition holds for the locator.
    logging.info('scraping %s', url)
    try:
        browser.get(url)
        wait.until(condition(locator))
    except TimeoutException:
        logging.error('error occurred while scraping %s', url, exc_info=True)


def scrape_index(page):
    # Open one listing page and wait until all movie items are visible.
    url = INDEX_URL.format(page=page)
    scrape_page(url, condition=EC.visibility_of_all_elements_located,
                locator=(By.CSS_SELECTOR, '#index .item'))


def parse_index():
    # Yield the absolute detail-page URL of every movie on the current listing page.
    # (find_elements(By.CSS_SELECTOR, ...) is the Selenium 4 spelling of the
    # older find_elements_by_css_selector used in the course.)
    elements = browser.find_elements(By.CSS_SELECTOR, '#index .item .name')
    for element in elements:
        href = element.get_attribute('href')
        yield urljoin(INDEX_URL, href)


def scrape_detail(url):
    # Open a detail page and wait until the movie title (an h2) is visible.
    scrape_page(url, condition=EC.visibility_of_element_located,
                locator=(By.TAG_NAME, 'h2'))


def parse_detail():
    # Extract the name, categories, cover image, score and synopsis
    # from the detail page currently loaded in the browser.
    url = browser.current_url
    name = browser.find_element(By.TAG_NAME, 'h2').text
    categories = [element.text for element in
                  browser.find_elements(By.CSS_SELECTOR, '.categories button span')]
    cover = browser.find_element(By.CSS_SELECTOR, '.cover').get_attribute('src')
    score = browser.find_element(By.CLASS_NAME, 'score').text
    drama = browser.find_element(By.CSS_SELECTOR, '.drama p').text
    return {
        'url': url,
        'name': name,
        'categories': categories,
        'cover': cover,
        'score': score,
        'drama': drama
    }


from os import makedirs
from os.path import exists
import json

RESULTS_DIR = 'results'
exists(RESULTS_DIR) or makedirs(RESULTS_DIR)


def save_data(data):
    # Save one movie as results/<name>.json, keeping non-ASCII characters readable.
    name = data.get('name')
    data_path = f'{RESULTS_DIR}/{name}.json'
    with open(data_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)


def main():
    try:
        for page in range(1, TOTAL_PAGE + 1):
            scrape_index(page)
            # Materialize the generator before navigating away from the listing
            # page; otherwise the elements it reads would become stale.
            detail_urls = list(parse_index())
            for detail_url in detail_urls:
                logging.info('get detail url %s', detail_url)
                scrape_detail(detail_url)
                detail_data = parse_detail()
                save_data(detail_data)
                logging.info('detail data %s', detail_data)
    finally:
        # quit() ends the whole WebDriver session, not just the current window.
        browser.quit()


if __name__ == '__main__':
    main()
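
After a run, each movie is stored as a separate JSON file under the results directory. As a quick sanity check, the saved output can be loaded back like this (a minimal sketch, not part of the original course code, assuming the script above has already populated results/):

import json
from pathlib import Path

# Load every saved movie record back into a list of dicts.
movies = []
for path in Path('results').glob('*.json'):
    with open(path, encoding='utf-8') as f:
        movies.append(json.load(f))
print(f'loaded {len(movies)} movie records')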

 

From the Lagou Education course 《52讲轻松搞定爬虫》 (52 Lessons to Easily Master Web Crawlers).

Thanks.

Original post: https://www.cnblogs.com/zhzhang/p/15110608.html
