No browser is required anymore: the script now uses pure Python and nothing more. It is much faster and reports the percentage completed while downloading the video metadata.

author: msglm <msglm@techchud.xyz> 2023-01-11 20:51:08 -0600
committer: msglm <msglm@techchud.xyz> 2023-01-11 20:51:08 -0600
commit: 85f166eeecf2d8ce60b606f39354c59542825b87 (patch)
tree: 3fb92961e875ef162abcdb8403f0f6b83ad3bddd
parent: defaceb8c3b53d2aefbd7679390e915b20953c19 (diff)
download: comedyGenerator-85f166eeecf2d8ce60b606f39354c59542825b87.tar.gz
comedyGenerator-85f166eeecf2d8ce60b606f39354c59542825b87.tar.bz2
comedyGenerator-85f166eeecf2d8ce60b606f39354c59542825b87.zip
1 files changed, 112 insertions, 137 deletions
diff --git a/comedyGenerator b/comedyGenerator
index f281ed2..5b28284 100755
--- a/comedyGenerator
+++ b/comedyGenerator
@@ -6,118 +6,95 @@ import os
 import argparse
 import time
 import hashlib
+import json
 
 
 
 parser = argparse.ArgumentParser(add_help=True)
 
 parser.add_argument('--source',
-                    default=False,
-                    action='store_true',
-                    dest='source',
-                    help='Links to the source code of the software'
-                    )
+        default=False,
+        action='store_true',
+        dest='source',
+        help='Links to the source code of the software'
+        )
 
 parser.add_argument('--debug',
-                    default=False,
-                    action='store_true',
-                    dest='debug',
-                    help='Enables debug output'
-                    )
-
-parser.add_argument('-js','-JS',"--Javascript",'--javascript',
-                    default=True,
-                    action='store_true',
-                    dest='usingJavascript',
-                    help='Enables the user of a webdriver to scrape funnies'
-                    )
+        default=False,
+        action='store_true',
+        dest='debug',
+        help='Enables debug output'
+        )
 
 parser.add_argument('--amount','-a',
-                    default=0,
-                    dest='amount',
-                    action="store",
-                    help="the amount of funnies you'd like to download per tag",
-                    type=int
-                    )
+        default=0,
+        dest='amount',
+        action="store",
+        help="the amount of funnies you'd like to download per tag",
+        type=int
+        )
 
 parser.add_argument('tags',
-                    nargs='+',
-                    type=str,
-                    help='Provides tags to be check for funny downloading'
-                    )
+        nargs='+',
+        type=str,
+        help='Provides tags to be check for funny downloading'
+        )
 
 args = parser.parse_args()
 
 if args.source:
     print("https://git.snootgame.xyz/PrincipalSpears/comedyGenerator")
-
-if not args.usingJavascript:
-    from bs4 import BeautifulSoup
-    print("using html mode...")
-    for tags in args.tags:
-        URL = requests.get('https://ifunny.co/tags/' + tags + '?filter=video')
-        webpage = BeautifulSoup(URL.content, 'html.parser')
-        videos = webpage.findAll('video')
-        if args.amount > 0:
-            videos = video[:arg.amount]
-        for video in videos:
-            name = tags + "-" + hashlib.md5(video.encode('utf-8')).hexdigest() + ".mp4"
-            print("saving " + video['data-src'] + " as " + name)
-            urllib.request.urlretrieve(video['data-src'], '/home/' + os.environ['USER'] + '/Videos/unsorted/' + name) 
-elif args.usingJavascript:
-    from selenium import webdriver
-    from selenium.webdriver.common.by import By
-    print("using JS mode...")
-    
-    try:
-        if args.debug:
-            print("testing if firefox works...")
-        from selenium.webdriver.firefox.options import Options
-        firefox_options = Options()
-        firefox_options.add_argument("--headless")
-        driver = webdriver.Firefox(options=firefox_options)
-        if args.debug:
-            print("Firefox Works!")
-
-    except:
-            if args.debug:
-                print("Firefox didn't work! Trying Chrome!")
-            try:
-                from selenium.webdriver.chrome.options import Options
-                chrome_options = Options()
-                #chrome_options.add_argument("--headless") #TODO completely breaks script and screenshot shows a white screen and nothing but. likely got discovered.
-                driver = webdriver.Chrome(options=chrome_options)
-                if args.debug:
-                    print("Chrome Works!")
-            except:
-                if args.debug:
-                    print("Chrome Failed! Going to attempt an install of the firefox webdriver")
-                    try:
-                        from selenium.webdriver.firefox.options import Options
-                        firefox_options = Options()
-                        firefox_options.add_argument("--headless")
-                        driver = webdriver.Firefox(executable_path=GeckoDriverManager().install(), options=firefox_options)
-                        if args.debug:
-                             print("Install successful! using Firefox!")
-                    except:
-                        if args.debug:
-                            print("Install Failed! Trying Chrome webdriver install!")
-                            try:
-                                from selenium.webdriver.chrome.options import Options
-                                chrome_options = Options()
-                                chrome_options.add_argument("--headless")
-                                driver = webdriver.Chrome(ChromeDriverManager(chrome_type=ChromeType.CHROMIUM).install(), options=chrome_options)
-                            except:
-                                    print("Could not find webdriver!")
-                                    print("You'll have to manually install a webdriver to your path")
-                                    print("If you are using GNU/Linux, it is likely that you can install from your standard repos. Debian labels their chromium driver chromium-driver. If you wish to use an ungoogled version of chromium (as to reduce possiblity of spying), you can find a link to that here: https://github.com/Eloston/ungoogled-chromium#downloads. On Debian (or debian likes such as Ubuntu or Devuan), you may then run apt install ungoogled-chromium-driver and this will no longer fail.")
-                                    sys.exit()
-    for tags in args.tags:
+    sys.exit(0)
+
+headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0",
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+        "Accept-Language": "en-US,en;q=0.5",
+        "DNT": "1",
+        "Connection": "keep-alive",
+        "Upgrade-Insecure-Requests": "1",
+        "Sec-Fetch-Dest": "document",
+        "Sec-Fetch-Mode": "navigate",
+        "Sec-Fetch-Site": "cross-site",
+        "Cache-Control": "max-age=0"
+        }
+
+videos = []
+for tags in args.tags:
         if args.debug:
             print("Downloading Tag: " + tags)
+        headers = {
+                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0",
+                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+                "Accept-Language": "en-US,en;q=0.5",
+                "DNT": "1",
+                "Connection": "keep-alive",
+                "Upgrade-Insecure-Requests": "1",
+                "Sec-Fetch-Dest": "document",
+                "Sec-Fetch-Mode": "navigate",
+                "Sec-Fetch-Site": "cross-site",
+                "Cache-Control": "max-age=0"
+            }
+
+        master = requests.get('https://ifunny.co/', headers=headers)
+        combineHeader = (dict(master.headers)|headers)
+        requestHeader = {
+                "User-Agent":combineHeader['User-Agent'],
+                "Content-Type":combineHeader['Content-Type'],
+                "x-requested-with": "fetch",
+                "x-csrf-token": combineHeader['Set-Cookie'].split(';')[0].split('=')[1],
+                "set-cookies":combineHeader['Set-Cookie'],
+                "access-control-allow-headers":combineHeader['access-control-allow-headers']
+                }
+        requestCookies = {
+                "CID" : combineHeader['Set-Cookie'].split(';')[3].split('=')[2],
+                "sound" : "off",
+                "viewMode" : "list",
+                "x-csrf-token": combineHeader['Set-Cookie'].split(';')[0].split('=')[1]
+                }
         for tries in range(100):
             try:
-                driver.get('https://ifunny.co/tags/' + tags + '?filter=video')
+                tagPage = requests.get("https://ifunny.co/api/v1/feeds?filter=video&tag=" + tags, headers=requestHeader, cookies=requestCookies)
                 if args.debug:
                     print("Got Webpage!")
             except:
@@ -127,65 +104,63 @@ elif args.usingJavascript:
                     continue
             break
 
-        if args.amount > 0:
-            if args.debug:
-                print("starting to scroll...")
-            isTimesScrolled = 0
-            oughtTimeScrolled = (args.amount/10) + 1
-            while isTimesScrolled < int(oughtTimeScrolled):
-                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
-                isTimesScrolled = isTimesScrolled + 1
-                time.sleep(3)
-                if args.debug:
-                    print("Need to scroll " + str(oughtTimeScrolled-isTimesScrolled) + " more times...")
-                videos = driver.find_elements(By.TAG_NAME,"video")
-            if args.debug:
-                print("Original Videos List: ")
-                print(videos)
-        if args.amount > 0:
+        JSONDump = tagPage.json()
+        while len(videos) < args.amount:
+            print("Currently have " + str(len(videos)) + " videos out of " + str(args.amount) + " (" + str((len(videos)/args.amount)*100) + "%)")
+            for item in range(len(JSONDump['items'])):
+                videos.append(JSONDump['items'][item]['url'])
+            for tries in range(100):
+                try:
+                    tagPage = requests.get("https://ifunny.co/api/v1/feeds?filter=video&tag=" + tags + "&next=" + JSONDump['pagination']['next'], headers=requestHeader, cookies=requestCookies)
+                    JSONDump = tagPage.json()
+                    if args.debug:
+                        print("Got New Tag Page!")
+                    break
+                except:
+                    if tries < 100 - 1:
+                        print("Rate Limited! Sleeping for " + str(tries*1.5) + " seconds!")
+                        time.sleep(tries*1.5)
+                        continue
+        if len(videos) > args.amount:
             videos = videos[:args.amount]
             if args.debug:
                 print("Videos list truncated! Its now: " + str(len(videos)) + " units long")
         for video in videos:            
             print("Now running for " + str(video))
-            URL = video.get_attribute("data-src")
-                
-            if isinstance(URL, str):
+
+            if isinstance(video, str):
                 if args.debug:
-                    print("URL read as: " + URL)
-                name = tags + "-" + hashlib.md5(URL.encode('utf-8')).hexdigest() + ".mp4"
+                    print("URL read as: " + video)
+                name = tags + "-" + hashlib.md5(video.encode('utf-8')).hexdigest() + ".mp4"
+                path = '/home/' + os.environ['USER'] + '/Videos/unsorted/' + name
                 if args.debug:
                     print("name read as: " + name)
-                    print("saving " + video.get_attribute("data-src") + " as " + name)
-                for tries in range(100):
-                    try:
-                        urllib.request.urlretrieve(video.get_attribute("data-src"), '/home/' + os.environ['USER'] + '/Videos/unsorted/' + name) 
-                    except:
+                    print("saving " + video + " as " + name)
+                if os.path.exists(path):
+                    print(name + " already exists!")
+                else:
+                    for tries in range(100):
+                        try:
+                            urllib.request.urlretrieve(video, path) 
+                        except:
                             if tries < 100 - 1:
                                 print("Rate Limited! Sleeping for " + str(tries*1.5) + " seconds!")
                                 time.sleep(tries*1.5)
-                                continue
-                    break
-                    #urllib.request.urlretrieve(video.get_attribute("data-src"), '/home/' + os.environ['USER'] + '/Videos/unsorted/' + name) 
+                            continue
+                        break
+                #urllib.request.urlretrieve(video.get_attribute("data-src"), '/home/' + os.environ['USER'] + '/Videos/unsorted/' + name) 
 
             elif args.debug:
-                print("URL is NOT a string, it is a " + str(type(URL)))
-
-
+                print("URL is NOT a string, it is a " + str(type(video)))
 
-    driver.quit()
-else:
-    print("Neither Javascript or HTML was given!")
-    sys.exit(1)
-    
-#    This program is free software: you can redistribute it and/or modify
-#    it under the terms of the GNU Affero General Public License version 3 as published by
-#    the Free Software Foundation.
+#This program is free software: you can redistribute it and/or modify
+#it under the terms of the GNU Affero General Public License version 3 as published by
+#the Free Software Foundation.
 #
-#    This program is distributed in the hope that it will be useful,
-#    but WITHOUT ANY WARRANTY; without even the implied warranty of
-#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-#    GNU Affero General Public License for more details.
+#This program is distributed in the hope that it will be useful,
+#but WITHOUT ANY WARRANTY; without even the implied warranty of
+#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#GNU Affero General Public License for more details.
 #
-#    You should have received a copy of the GNU Affero General Public License
-#    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+#You should have received a copy of the GNU Affero General Public License
+#along with this program.  If not, see <https://www.gnu.org/licenses/>.
author	msglm <msglm@techchud.xyz>	2023-01-11 20:51:08 -0600
committer	msglm <msglm@techchud.xyz>	2023-01-11 20:51:08 -0600
commit	85f166eeecf2d8ce60b606f39354c59542825b87 (patch)
tree	3fb92961e875ef162abcdb8403f0f6b83ad3bddd
parent	defaceb8c3b53d2aefbd7679390e915b20953c19 (diff)
download	comedyGenerator-85f166eeecf2d8ce60b606f39354c59542825b87.tar.gz comedyGenerator-85f166eeecf2d8ce60b606f39354c59542825b87.tar.bz2 comedyGenerator-85f166eeecf2d8ce60b606f39354c59542825b87.zip