Phú Nguyễn

26 bundles
1 file · 3 months ago
2

manhua.dmzj chapter crawler

index.py

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import urllib
import urllib.parse
import urllib.request
import os
import requests
import shutil


def downloadFiles(page, path):
    """Download the resource at URL *page* and save it to local file *path*.

    Streams the response to disk in chunks via ``shutil.copyfileobj`` instead
    of the legacy ``urllib.request.urlretrieve`` interface, which the Python
    documentation warns may become deprecated.
    """
    with urllib.request.urlopen(page) as response, open(path, "wb") as out_file:
        shutil.copyfileobj(response, out_file)


# Chapter page to crawl; images are lazy-loaded so a real browser is required.
url = 'https://manhua.dmzj.com/yuzhouxiaomaoqiu/38643.shtml'

driver = webdriver.Firefox()
try:
    driver.get(url)

    # Dismiss the login popup that overlays the reader.
    # NOTE: find_element_by_* helpers were removed in Selenium 4;
    # find_element(By.…) works on both Selenium 3 and 4.
    login_box = driver.find_element(By.CLASS_NAME, "login_tip")
    webdriver.ActionChains(driver).move_to_element(
        login_box).click(login_box).perform()

    # Switch the reader to list view so every page image is in the DOM.
    list_toggle = driver.find_element(By.ID, "qiehuan_txt")
    webdriver.ActionChains(driver).move_to_element(
        list_toggle).click(list_toggle).perform()

    # Wait (up to 20 s) for the lazy-loaded image containers to appear
    # before taking a snapshot of the rendered DOM.
    WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.CLASS_NAME, "inner_img")))

    context = driver.execute_script(
        "return document.documentElement.outerHTML")
finally:
    # Always shut the browser down, even when the wait times out —
    # otherwise the Firefox process is leaked.
    driver.quit()

web_soup = BeautifulSoup(context, 'html.parser')

# Comic title: the "title" attribute of the last matching breadcrumb link.
name = ""
for title_link in web_soup.findAll("a", {"class": "redhotl"}):
    name = title_link["title"]

# Chapter label: text of the first matching breadcrumb span.
# NOTE(review): raises IndexError if the page layout changes and no
# span is found — same as the original behavior.
chapter = web_soup.findAll("span", {"class": "redhotl"})[0].text

# Collect every page-image URL. The real URL is lazy-loaded and sits in
# the protocol-relative "data-original" attribute, so prepend "http:".
sources = []
for container in web_soup.findAll("div", {"class": "inner_img"}):
    for single_image in container.findAll("img"):
        final_url = "http:" + single_image["data-original"]
        print(final_url)
        sources.append(final_url)

print(sources)

# Save into ./<comic name>/<chapter>/ — exist_ok avoids the
# check-then-create race of the original os.path.exists guards.
save_dir = os.path.join(os.getcwd(), name, chapter)
os.makedirs(save_dir, exist_ok=True)

# Download each page exactly once (the original also issued an unused
# requests.get() per page, fetching every image twice).
for page in sources:
    file_name = os.path.basename(page)
    downloadFiles(page, os.path.join(save_dir, file_name))