## indeedwatcher: polls Indeed RSS feeds, visits each new listing through
## chromedriver, filters out blacklisted titles, and writes the surviving
## listings as text files into a configured output directory.
import rss
import webdriver
import osproc
import options
import os
import strutils
import parsetoml
import sequtils

# ---------------------------------------------------------------------------
# First-run setup: write a template config the user can edit.
# NOTE: quotes inside a triple-quoted Nim string must NOT be backslash-escaped
# (raw string) — the previous build emitted literal backslashes, producing an
# invalid TOML file.
if not fileExists(getConfigDir() & "/indeedwatcher/config.toml"):
  createDir(getConfigDir() & "/indeedwatcher/")
  let defaultConfig = """
#Output directory of your potential job listings
outdir = ""
#Port you wish chromedriver to use
port = 9515
#Location of chromedriver
chromedriver = ""
#Array of RSS urls that you wish the program to parse
feeds = [ "https://rss.indeed.com/rss?q=Information%20Technology&l=Remote&jt=contract&explvl=entry_level", "https://rss.indeed.com/rss?q=Information%20Technology&l=Remote&jt=temporary&explvl=entry_level"]
#Phrases that, if they appear, will cause the job to be instantly thrown out
[blacklist]
title= ["Senior", "Sr."]
"""
  writeFile(getConfigDir() & "/indeedwatcher/config.toml", defaultConfig)

# Create an empty listings cache on first start only.
# BUGFIX: the old check looked for "config.toml" in the cache dir, which never
# exists, so the cache was wiped empty on every single run.
if not fileExists(getCacheDir() & "/indeedwatcher/listings.cache"):
  createDir(getCacheDir() & "/indeedwatcher/")
  writeFile(getCacheDir() & "/indeedwatcher/listings.cache", "")

# ---------------------------------------------------------------------------
# Reading the config file
let config = parsetoml.parseFile(getConfigDir() & "/indeedwatcher/config.toml")

## Main section of config
let feeds = config["feeds"].getElems().mapIt(it.getStr())
let outdir = config["outdir"].getStr()
# Port chromedriver listens on; falls back to chromedriver's default when the
# key is missing from an older config file.
let port =
  try:
    int(config["port"].getInt())
  except CatchableError:
    9515

## Cache section of config
let chromedriverloc = config["chromedriver"].getStr()
let cachefileloc = getCacheDir() & "/indeedwatcher/listings.cache"
var cache = splitLines(readFile(cachefileloc))

## Filtering section of config
let titleblacklist = config["blacklist"]["title"].getElems().mapIt(it.getStr())

# ---------------------------------------------------------------------------
# Webdriver startup: launch chromedriver on the configured port, then connect.
let chromedriver = startProcess(chromedriverloc, "",
                                ["--headless", "--port=" & $port])
sleep 5000  # give chromedriver time to start listening before we connect
echo "connecting"
let driver = newWebDriver("http://localhost:" & $port)
let session = driver.createSession()
var counter = 0  # listings visited since the last throttle pause

proc cleanup() {.noconv.} =
  ## Ctrl-C hook: persist the cache, close the webdriver session, stop
  ## chromedriver, and exit.  (Renamed from `terminate`, which confusingly
  ## overloaded osproc.terminate.)
  echo "\nAcknowledged termination attempt..."
  echo "Writing Cache..."
  writeFile(cachefileloc, cache.join("\n"))
  echo "Closing Session..."
  session.close()
  echo "Killing Chromedriver..."
  terminate(chromedriver)
  echo "Dying!"
  quit()

setControlCHook(cleanup)

# ---------------------------------------------------------------------------
# Main loop: read every feed, visit every listing not already cached.
for feed in feeds:
  echo "now reading " & feed

  # Retry transient RSS fetch failures up to 5 times before giving up.
  # Only recoverable errors are retried; Defects still propagate.
  var rssFeedReply: RSS
  for attempt in countup(0, 5):
    try:
      rssFeedReply = getRSS(feed)
    except CatchableError:
      if attempt < 5 - 1:
        continue
      else:
        raise
    break

  for entry in rssFeedReply.items:
    # Sleep so indeed.com doesn't freak out (rate limiting).
    if counter > 7:
      echo "resting for 7 seconds ..."
      sleep 7000
      counter = 0

    let url = entry.link
    # The 4th '&'-separated chunk of the link is used as a stable listing id;
    # fall back to the whole URL when the link has fewer query parameters
    # (the old code raised IndexDefect there).
    let urlParts = url.split('&')
    let urlId = if urlParts.len > 3: urlParts[3] else: url

    # Don't even bother visiting it if it's in the cache.
    if not cache.anyIt(it.contains(urlId)):
      session.navigate(url)
      inc counter

      # HTML parsing: title and description are assumed present; employer and
      # salary blocks are optional, so probe the Option instead of catching
      # UnpackDefect (Defects should not be used for control flow).
      let jobTitle =
        session.findElement(".jobsearch-JobInfoHeader-title").get().getText()
      let fullDesc =
        session.findElement("#jobDescriptionText").get().getText()

      let employerElem =
        session.findElement(".jobsearch-InlineCompanyRating-companyHeader")
      let employer =
        if employerElem.isSome: employerElem.get().getText()
        else: "None Listed"

      let salaryElem = session.findElement("#salaryInfoAndJobType")
      let salaryInfoAndJobType =
        if salaryElem.isSome: salaryElem.get().getText()
        else: "None Listed"

      # Filtering: discard listings whose title contains a blacklisted phrase.
      if not titleblacklist.anyIt(jobTitle.contains(it)):
        # Output
        let output = """
Title: $1
Company: $2
Salary Info and Job Type: $3
URL : $4
Description:
$5
""" % [jobTitle, employer, salaryInfoAndJobType, url, fullDesc]
        # os.`/` joins correctly whether or not outdir has a trailing slash;
        # slashes are stripped from the title so it stays a single filename.
        writeFile(outdir / jobTitle.replace("/") & ".txt", output)
        cache.add(url)
      else:
        echo "Trigger was hit, discarding " & url
    else:
      echo url & " was in cache, discarding"

# Normal shutdown: mirror the cleanup hook.
session.close()
terminate(chromedriver)
writeFile(cachefileloc, cache.join("\n"))