
Python Selenium Multiprocessing Error

by 이승보 2021. 3. 8.
# -*- coding: utf8 -*- 
# from selenium import webdriver
# from selenium.webdriver.common.keys import Keys


from multiprocessing import Pool
from selenium import webdriver
from webdriver_manager.chrome  import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
from concurrent.futures import ThreadPoolExecutor
import concurrent.futures
import time
import urllib.request
import os
import cv2
import numpy as np
import sys



class NamedObject:
    def __init__(self, name, obj):
        self.name = name
        self.obj = obj
    # def __getattr__(self,attr):
    #     if attr == 'name':
    #         return self.name
    #     else :
    #         return getattr(self.obj, attr)

case01 = NamedObject('case01', ["송혜교", "김준호","고소영","전지현"])
case02 = NamedObject('case02', ["김민종","산다라박","박봄","송중기"])

crwalingList = [case01, case02]


FACE_CASCADE = cv2.CascadeClassifier(r'D:\ex_crawling\winSelenium\haarcascade_frontalface_default.xml')
FULL_PATH_WINDOW ="D:/ex_crawling/ImageFaceSource/"
SCROLL_PAUSE_TIME = 1.5  # seconds to wait after each scroll
IMG_LODING_TIME = 1.5   # seconds to wait for an image to load after clicking it
maxImgCount = 201     # number of images to crawl per name
START_TIME = time.time()


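# cv2.imwrite cannot write to paths containing non-ASCII (e.g. Korean) characters on
# Windows, so encode the image to a memory buffer with imencode and dump it with tofile.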
def imwrite(filename, img, params=None): 
    try: 
        ext = os.path.splitext(filename)[1]
        result, n = cv2.imencode(ext, img, params)
        if result:
            with open(filename, mode='w+b') as f:
                n.tofile(f)
            return True
        else: 
            return False 
    except Exception as e: 
        print(e) 
        return False


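# Fetch an image over HTTP and decode it into an OpenCV BGR array.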
def url_to_image(url):
    resp = urllib.request.urlopen(url)
    image = np.asarray(bytearray(resp.read()), dtype="uint8")
    image = cv2.imdecode(image, cv2.IMREAD_COLOR)
    #image = cv2.imdecode(image, cv2.IMREAD_GRAYSCALE)
    # return the image
    return image


def makeFolder(tPath) :
    #print("start make folder")
    try : 
        if not os.path.exists(tPath):
            os.makedirs(tPath)
            #print(tPath)
        return tPath
    except OSError :
        print ('Error: Creating directory. ' +  tPath)

def scrollToBottom(driver):
    last_height = driver.execute_script("return document.body.scrollHeight") # current page height from JavaScript
    while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") # scroll the browser from top to bottom
        # Wait to load page
        time.sleep(SCROLL_PAUSE_TIME) # wait for the newly loaded results
        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight") # measure the page height again
        if new_height == last_height: # nothing new appeared after scrolling
            try : # click the "Show more results" button if it exists; otherwise we are done
                driver.find_element_by_css_selector(".mye4qd").click()
            except:
                #print("except line height")
                break
        last_height = new_height

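# Search Google Images for `name`, scroll through every result, run face detection on
# each picture and save the cropped faces as grayscale jpg files under tempPath.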
def do_crawl_test(name:str, tempPath:str):
    #print("@@@@@@@@@Start do_crawl_test %s with %s"%(name, tempPath))
    eachCrawl_start = time.time()

    options = webdriver.ChromeOptions()
    # options.add_experimental_option("detach", True)
    # options.add_argument('--ignore-certificate-errors')
    # options.add_argument('--ignore-ssl-errors')
    # options.add_experimental_option("excludeSwitches", ["enable-logging"])


    #driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
    # driver = webdriver.Chrome(chrome_options=options,executable_path=r'D:\ex_crawling\winSelenium\chromedriver.exe')
    driver = webdriver.Chrome(executable_path=r'D:\ex_crawling\winSelenium\chromedriver.exe')
    
    driver.get("https://www.google.co.kr/imghp?hl=ko&ogbl")
    print(name)
    elem = driver.find_element_by_name("q")
    elem.send_keys(name)
    time.sleep(IMG_LODING_TIME)
    elem.send_keys(Keys.RETURN)
    time.sleep(IMG_LODING_TIME)

    # Get scroll height
    scrollToBottom(driver)

    imgList = driver.find_elements_by_css_selector(".rg_i.Q4LuWd")
    count = 1
    for img in imgList:
        try : 
            img.click()
            time.sleep(IMG_LODING_TIME)
            img = driver.find_elements_by_css_selector(".n3VNCb")[1]
            imgUrl = driver.find_elements_by_css_selector(".n3VNCb")[1].get_attribute("src")
            # the width/height parsed from the style attribute below are only the on-screen display size
            #imgWidth = int(float(img.get_attribute("style").split(";")[0].split(":")[1].split("px")[0])) 
            #imgHeight = int(float(img.get_attribute("style").split(";")[1].split(":")[1].split("px")[0]))
            
            tImg = url_to_image(imgUrl)
            imgHeight = np.size(tImg,0)
            imgWidth = np.size(tImg,1)
            

            checkImg = True
            # if (imgWidth > 200 and imgHeight > 200) or imgWidth > 250 or imgHeight > 250 :
            #     checkImg = True
            # else :
            #     print("small size img")
            

            if checkImg == True :                
                # original approach: save the image straight from its URL
                #urllib.request.urlretrieve(imgUrl, tempPath + "\\" + name + str(count) + ".jpg")                           
    
                grayImg = cv2.cvtColor(tImg, cv2.COLOR_BGR2GRAY)                    
                faces = FACE_CASCADE.detectMultiScale(grayImg, 1.2, 5)

                #print("face number is ",len(faces))
                for (x, y, w, h) in faces:
                    top = y 
                    bottom = y + h if (y+h) < imgHeight else imgHeight
                    left = x
                    right = x + w if (x+w) < imgWidth else imgWidth
                    
                    if (bottom - top) < 100 or (right-left) < 100 :
                        #print("%s: small size face : image Height= %f, Widht=%f : bottom(%f)-top(%f) = %f x right(%f)-left(%f) = %f"%(name,imgHeight,imgWidth,bottom,top,(bottom-top),right,left,(right-left)))
                        continue

                    faceImg = tImg[top:bottom,left:right]
                    grayFaceImg = cv2.cvtColor(faceImg, cv2.COLOR_BGR2GRAY)
                    print(tempPath + "/%s_%d.jpg"%(name,count))
                    imwrite(tempPath + "/%s_%d.jpg"%(name,count),grayFaceImg)
                    count = count + 1
                    #print("%3d_after save img"%count)
            if count >= maxImgCount :
                #print("finish finish")
                break
        except:
            #print(name,"there is some problem .........img pass")
            pass
    print(name, " ", count, "images, elapsed time:", time.time() - eachCrawl_start)
    driver.close() 

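# Entry point for the combined version: each worker process handles one theme and
# fans its names out to threads via do_thread_crawl.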
def do_process_with_thread_crawl(crawltheme:str):
    try : 
        #print("start Process")    
        #do_thread_crawl(get_theme_nameList(crawltheme),makeFolder(os.path.join(FULL_PATH_WINDOW,get_theme_name(crawltheme))))
        do_thread_crawl(crawltheme.obj,makeFolder(os.path.join(FULL_PATH_WINDOW,crawltheme.name)))
    except RecursionError :
        print("*************************recursionError here")   

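# Crawl all names of a theme concurrently with up to 2 threads inside the current
# process (the multithreaded variant that kept crashing Chrome for me).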
def do_thread_crawl(names:list, themePath:str):
    #print("**************start thread*******************")
    thread_list = []
    with ThreadPoolExecutor(max_workers=2) as executor:
        for name in names:
            tempPath = makeFolder(os.path.join(themePath,name))
            #print("%s : %s"%(name,tempPath))
            thread_list.append(executor.submit(do_crawl_test, name,tempPath))
        for execution in concurrent.futures.as_completed(thread_list) :
            execution.result()

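# Wrapper used by Pool.map: each worker process crawls a single name.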
def do_only_process_crawl(name:str):
    try : 
        do_crawl_test(name, makeFolder(os.path.join(FULL_PATH_WINDOW,'men_2010',name)))
    except : 
        print("fail only process")
 

if __name__ == "__main__":
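    # On Windows, multiprocessing starts workers with the "spawn" method, which re-imports
    # this module in every child process; without this __main__ guard the children would
    # try to run the crawl (and spawn more processes) all over again.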
    ################# multiprocessing only, no multithreading ####################
    for crawlTheme in crwalingList :
        print("****************Start :" + crawlTheme.name)
        themePath = makeFolder(os.path.join(FULL_PATH_WINDOW,crawlTheme.name))
        with Pool(processes=2) as pool:
            try :
                pool.map(do_only_process_crawl, crawlTheme.obj)
            except RecursionError :
                print("*************************recursionError here")

        ##########################################################
    print("****************total elapsed time: %4f seconds"%(time.time()-START_TIME))

    # ################# multithreading + multiprocessing ####################
    
    # with Pool(processes=2) as pool:
    #     try :
    #         pool.map(do_process_with_thread_crawl, crwalingList)
    #     except RecursionError :
    #         print("*************************recursionError here")

    #         ##########################################################
    #print("****************total elapsed time: %4f seconds"%(time.time()-START_TIME))


    # ############ multithreading only, no multiprocessing ###############
    # for crawlTheme in crwalingList :
    #         ########## get_sblist
    #         print("****************Start :" + crawlTheme.name)
    #         themePath = makeFolder(os.path.join(FULL_PATH_WINDOW,crawlTheme.name))

    #         do_thread_crawl(crawlTheme.obj,themePath)
    # print("****************total elapsed time: %4f seconds"%(time.time()-START_TIME))
    # ##############################################################

    # ############ before applying multithreading/multiprocessing ###############
    # for crawlTheme in crwalingList :
    #     print("****************Start :" + crawlTheme.name)
    #     themePath = makeFolder(os.path.join(FULL_PATH_WINDOW,crawlTheme.name))
    #     for name in crawlTheme.obj:
    #         ######### do_html_crawl
    #         print("******************Start :" + name);
    #         tempPath = makeFolder(os.path.join(themePath,name))
    #         do_crawl_test(name,tempPath)
    #     print("****************total elapsed time: %4f seconds"%(time.time()-START_TIME))
    # ##############################################################



 

When I run the image crawl with multithreading, it works for a while and then the Chrome browser just dies...

I googled around, but everyone only says to make the Chrome browser and driver versions match, and mine already do ㅜㅜ

I was about to give up on multithreading and crawl one name at a time, but just in case I tried multiprocessing instead, and it works fine...
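
For reference, the pattern that ended up working is roughly the minimal sketch below: each Pool worker creates, uses, and quits its own Chrome driver, so no driver object or browser session is ever shared between workers. The keyword list here is just a placeholder; the driver path is the one used in the script above.

from multiprocessing import Pool
from selenium import webdriver

def crawl_one(keyword):
    # Every worker process owns its own Chrome instance for the whole task.
    driver = webdriver.Chrome(executable_path=r'D:\ex_crawling\winSelenium\chromedriver.exe')
    try:
        driver.get("https://www.google.co.kr/imghp?hl=ko&ogbl")
        # ... type the keyword, scroll, and save images here ...
    finally:
        driver.quit()

if __name__ == "__main__":
    with Pool(processes=2) as pool:
        pool.map(crawl_one, ["keyword1", "keyword2"])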

 

Spending time figuring out the root cause of things like this is how you actually learn, but my Python is still shallow, and trying to dig that deep would eat up too much time...

I still don't know the cause, but for now, just going with it...

 

