From 85f166eeecf2d8ce60b606f39354c59542825b87 Mon Sep 17 00:00:00 2001 From: msglm Date: Wed, 11 Jan 2023 20:51:08 -0600 Subject: No more browser required, uses pure python and nothing more. Much faster and tells you percent completed when downloading the video metadata. --- comedyGenerator | 249 +++++++++++++++++++++++++------------------------------- 1 file changed, 112 insertions(+), 137 deletions(-) diff --git a/comedyGenerator b/comedyGenerator index f281ed2..5b28284 100755 --- a/comedyGenerator +++ b/comedyGenerator @@ -6,118 +6,95 @@ import os import argparse import time import hashlib +import json parser = argparse.ArgumentParser(add_help=True) parser.add_argument('--source', - default=False, - action='store_true', - dest='source', - help='Links to the source code of the software' - ) + default=False, + action='store_true', + dest='source', + help='Links to the source code of the software' + ) parser.add_argument('--debug', - default=False, - action='store_true', - dest='debug', - help='Enables debug output' - ) - -parser.add_argument('-js','-JS',"--Javascript",'--javascript', - default=True, - action='store_true', - dest='usingJavascript', - help='Enables the user of a webdriver to scrape funnies' - ) + default=False, + action='store_true', + dest='debug', + help='Enables debug output' + ) parser.add_argument('--amount','-a', - default=0, - dest='amount', - action="store", - help="the amount of funnies you'd like to download per tag", - type=int - ) + default=0, + dest='amount', + action="store", + help="the amount of funnies you'd like to download per tag", + type=int + ) parser.add_argument('tags', - nargs='+', - type=str, - help='Provides tags to be check for funny downloading' - ) + nargs='+', + type=str, + help='Provides tags to be check for funny downloading' + ) args = parser.parse_args() if args.source: print("https://git.snootgame.xyz/PrincipalSpears/comedyGenerator") - -if not args.usingJavascript: - from bs4 import BeautifulSoup - print("using 
html mode...") - for tags in args.tags: - URL = requests.get('https://ifunny.co/tags/' + tags + '?filter=video') - webpage = BeautifulSoup(URL.content, 'html.parser') - videos = webpage.findAll('video') - if args.amount > 0: - videos = video[:arg.amount] - for video in videos: - name = tags + "-" + hashlib.md5(video.encode('utf-8')).hexdigest() + ".mp4" - print("saving " + video['data-src'] + " as " + name) - urllib.request.urlretrieve(video['data-src'], '/home/' + os.environ['USER'] + '/Videos/unsorted/' + name) -elif args.usingJavascript: - from selenium import webdriver - from selenium.webdriver.common.by import By - print("using JS mode...") - - try: - if args.debug: - print("testing if firefox works...") - from selenium.webdriver.firefox.options import Options - firefox_options = Options() - firefox_options.add_argument("--headless") - driver = webdriver.Firefox(options=firefox_options) - if args.debug: - print("Firefox Works!") - - except: - if args.debug: - print("Firefox didn't work! Trying Chrome!") - try: - from selenium.webdriver.chrome.options import Options - chrome_options = Options() - #chrome_options.add_argument("--headless") #TODO completely breaks script and screenshot shows a white screen and nothing but. likely got discovered. - driver = webdriver.Chrome(options=chrome_options) - if args.debug: - print("Chrome Works!") - except: - if args.debug: - print("Chrome Failed! Going to attempt an install of the firefox webdriver") - try: - from selenium.webdriver.firefox.options import Options - firefox_options = Options() - firefox_options.add_argument("--headless") - driver = webdriver.Firefox(executable_path=GeckoDriverManager().install(), options=firefox_options) - if args.debug: - print("Install successful! using Firefox!") - except: - if args.debug: - print("Install Failed! 
Trying Chrome webdriver install!") - try: - from selenium.webdriver.chrome.options import Options - chrome_options = Options() - chrome_options.add_argument("--headless") - driver = webdriver.Chrome(ChromeDriverManager(chrome_type=ChromeType.CHROMIUM).install(), options=chrome_options) - except: - print("Could not find webdriver!") - print("You'll have to manually install a webdriver to your path") - print("If you are using GNU/Linux, it is likely that you can install from your standard repos. Debian labels their chromium driver chromium-driver. If you wish to use an ungoogled version of chromium (as to reduce possiblity of spying), you can find a link to that here: https://github.com/Eloston/ungoogled-chromium#downloads. On Debian (or debian likes such as Ubuntu or Devuan), you may then run apt install ungoogled-chromium-driver and this will no longer fail.") - sys.exit() - for tags in args.tags: + sys.exit(0) + +headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.5", + "DNT": "1", + "Connection": "keep-alive", + "Upgrade-Insecure-Requests": "1", + "Sec-Fetch-Dest": "document", + "Sec-Fetch-Mode": "navigate", + "Sec-Fetch-Site": "cross-site", + "Cache-Control": "max-age=0" + } + +videos = [] +for tags in args.tags: if args.debug: print("Downloading Tag: " + tags) + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.5", + "DNT": "1", + "Connection": "keep-alive", + "Upgrade-Insecure-Requests": "1", + "Sec-Fetch-Dest": "document", + "Sec-Fetch-Mode": "navigate", + "Sec-Fetch-Site": "cross-site", + "Cache-Control": "max-age=0" + } + + master = requests.get('https://ifunny.co/', headers=headers) + 
combineHeader = (dict(master.headers)|headers) + requestHeader = { + "User-Agent":combineHeader['User-Agent'], + "Content-Type":combineHeader['Content-Type'], + "x-requested-with": "fetch", + "x-csrf-token": combineHeader['Set-Cookie'].split(';')[0].split('=')[1], + "set-cookies":combineHeader['Set-Cookie'], + "access-control-allow-headers":combineHeader['access-control-allow-headers'] + } + requestCookies = { + "CID" : combineHeader['Set-Cookie'].split(';')[3].split('=')[2], + "sound" : "off", + "viewMode" : "list", + "x-csrf-token": combineHeader['Set-Cookie'].split(';')[0].split('=')[1] + } for tries in range(100): try: - driver.get('https://ifunny.co/tags/' + tags + '?filter=video') + tagPage = requests.get("https://ifunny.co/api/v1/feeds?filter=video&tag=" + tags, headers=requestHeader, cookies=requestCookies) if args.debug: print("Got Webpage!") except: @@ -127,65 +104,63 @@ elif args.usingJavascript: continue break - if args.amount > 0: - if args.debug: - print("starting to scroll...") - isTimesScrolled = 0 - oughtTimeScrolled = (args.amount/10) + 1 - while isTimesScrolled < int(oughtTimeScrolled): - driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") - isTimesScrolled = isTimesScrolled + 1 - time.sleep(3) - if args.debug: - print("Need to scroll " + str(oughtTimeScrolled-isTimesScrolled) + " more times...") - videos = driver.find_elements(By.TAG_NAME,"video") - if args.debug: - print("Original Videos List: ") - print(videos) - if args.amount > 0: + JSONDump = tagPage.json() + while len(videos) < args.amount: + print("Currently have " + str(len(videos)) + " videos out of " + str(args.amount) + " (" + str((len(videos)/args.amount)*100) + "%)") + for item in range(len(JSONDump['items'])): + videos.append(JSONDump['items'][item]['url']) + for tries in range(100): + try: + tagPage = requests.get("https://ifunny.co/api/v1/feeds?filter=video&tag=" + tags + "&next=" + JSONDump['pagination']['next'], headers=requestHeader, 
cookies=requestCookies) + JSONDump = tagPage.json() + if args.debug: + print("Got New Tag Page!") + break + except: + if tries < 100 - 1: + print("Rate Limited! Sleeping for " + str(tries*1.5) + " seconds!") + time.sleep(tries*1.5) + continue + if len(videos) > args.amount: videos = videos[:args.amount] if args.debug: print("Videos list truncated! Its now: " + str(len(videos)) + " units long") for video in videos: print("Now running for " + str(video)) - URL = video.get_attribute("data-src") - - if isinstance(URL, str): + + if isinstance(video, str): if args.debug: - print("URL read as: " + URL) - name = tags + "-" + hashlib.md5(URL.encode('utf-8')).hexdigest() + ".mp4" + print("URL read as: " + video) + name = tags + "-" + hashlib.md5(video.encode('utf-8')).hexdigest() + ".mp4" + path = '/home/' + os.environ['USER'] + '/Videos/unsorted/' + name if args.debug: print("name read as: " + name) - print("saving " + video.get_attribute("data-src") + " as " + name) - for tries in range(100): - try: - urllib.request.urlretrieve(video.get_attribute("data-src"), '/home/' + os.environ['USER'] + '/Videos/unsorted/' + name) - except: + print("saving " + video + " as " + name) + if os.path.exists(path): + print(name + " already exists!") + else: + for tries in range(100): + try: + urllib.request.urlretrieve(video, path) + except: if tries < 100 - 1: print("Rate Limited! 
Sleeping for " + str(tries*1.5) + " seconds!") time.sleep(tries*1.5) - continue - break - #urllib.request.urlretrieve(video.get_attribute("data-src"), '/home/' + os.environ['USER'] + '/Videos/unsorted/' + name) + continue + break + #urllib.request.urlretrieve(video.get_attribute("data-src"), '/home/' + os.environ['USER'] + '/Videos/unsorted/' + name) elif args.debug: - print("URL is NOT a string, it is a " + str(type(URL))) - - + print("URL is NOT a string, it is a " + str(type(video))) - driver.quit() -else: - print("Neither Javascript or HTML was given!") - sys.exit(1) - -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License version 3 as published by -# the Free Software Foundation. +#This program is free software: you can redistribute it and/or modify +#it under the terms of the GNU Affero General Public License version 3 as published by +#the Free Software Foundation. # -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. +#This program is distributed in the hope that it will be useful, +#but WITHOUT ANY WARRANTY; without even the implied warranty of +#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +#GNU Affero General Public License for more details. # -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see <https://www.gnu.org/licenses/>. +#You should have received a copy of the GNU Affero General Public License +#along with this program. If not, see <https://www.gnu.org/licenses/>. -- cgit v1.2.3