## indeedwatcher: reads the RSS feeds listed in the user's config file, opens
## every listing that is not yet in the cache through chromedriver, and writes
## one text file per listing whose title passes the blacklist.

import rss
import webdriver
import osproc
import options
import os
import strutils
import parsetoml
import sequtils

const
  appDirName = "indeedwatcher"
  defaultPort = 9515 # used when the config file has no `port` key
  maxAttempts = 4    # the original retry loops ran countup(0, 3)
  noneListed = "None Listed"
  # Triple-quoted literals do not process escape sequences, so the quotes
  # below must NOT be backslash-escaped: the backslashes would be written to
  # disk and the resulting TOML would not parse.
  defaultConfig = """
#Output directory of your porential job listings
outdir = ""
#Port you wish chromedriver to use
port = 9515
#Location of chromedriver
chromedriver = ""
#Array of RSS urls that you wish the program to parse
feeds = [ "https://rss.indeed.com/rss?q=Information%20Technology&l=Remote&jt=contract&explvl=entry_level",
          "https://rss.indeed.com/rss?q=Information%20Technology&l=Remote&jt=temporary&explvl=entry_level"]
#Phrases that, if they appear, will cause the job to be instantly thrown out
[blacklist]
title= ["Senior", "Sr."]
"""
  listingTemplate = """
Title: $1
Company: $2
Salary Info and Job Type: $3
URL : $4
Description:
$5
"""

let
  configDir = getConfigDir() / appDirName
  configFileLoc = configDir / "config.toml"
  cacheDir = getCacheDir() / appDirName
  cachefileloc = cacheDir / "listings.cache"

#First run: create a default config file for the user to fill in
if not fileExists(configFileLoc):
  createDir(configDir)
  writeFile(configFileLoc, defaultConfig)

#Only create the cache when it is missing. Testing for any other file name
#here would truncate the cache on every start.
if not fileExists(cachefileloc):
  createDir(cacheDir)
  writeFile(cachefileloc, "")

#Reading the config file
let config = parsetoml.parseFile(configFileLoc)
##Main section of config
let feeds = config["feeds"].getElems().mapIt(it.getStr())
let outdir = config["outdir"].getStr()
let chromedriverloc = config["chromedriver"].getStr()
#Config files without a `port` key keep working on the default port
let port =
  if config.hasKey("port"): config["port"].getInt().int
  else: defaultPort
##Filtering section of config
let titleblacklist = config["blacklist"]["title"].getElems().mapIt(it.getStr())

if chromedriverloc.len == 0:
  quit("No chromedriver location is configured, please edit " &
       configFileLoc, 1)

##Cache
#The append handle cannot be read back, and re-reading the file for every
#entry would miss lines still sitting in the write buffer. The cache is
#therefore loaded once and mirrored in memory as listings are added.
var knownListings = readFile(cachefileloc)
let cache = open(cachefileloc, fmAppend)

#Webdriver
let chromedriver = startProcess(chromedriverloc, "",
                                ["--headless", "--port=" & $port])
sleep 5000 #give chromedriver time to start listening
echo "connecting"
let driver = newWebDriver("http://localhost:" & $port)
var session: Session
var sessionOpen = false #true only while `session` refers to a live session
var counter = 0         #pages visited since the last rest

#Behavior when CTRL+C
proc terminate() {.noconv.} =
  ## Ctrl+C hook: closes the cache file, closes the browser session if one is
  ## currently open, kills chromedriver and quits.
  echo "\nAcknowledged termination attempt..."
  echo "Closing the Cache..."
  cache.close()
  echo "Closing Session..."
  if sessionOpen:
    session.close()
  echo "Killing Chromedriver..."
  terminate(chromedriver)
  echo "Dying!"
  quit()
setControlCHook(terminate)

template withRetries(body: untyped) =
  ## Runs `body` up to `maxAttempts` times and re-raises the last error.
  for attempt in 1 .. maxAttempts:
    try:
      body
      break
    except Exception:
      # Kept as broad as the original bare `except`: it is not visible from
      # this file whether the rss/webdriver error types derive from
      # CatchableError.
      if attempt == maxAttempts:
        raise
      echo "Recieved an error: trying again..."

func listingId(link: string): string =
  ## Returns the fourth '&'-separated field of `link`, which is used as the
  ## cache key. Falls back to the whole link when it has fewer fields.
  # NOTE(review): presumably this field is Indeed's job id - confirm.
  let fields = link.split('&')
  if fields.len > 3: fields[3] else: link

func employerFromLink(link: string): string =
  ## Takes the second '&'-separated field of `link`, drops its first two
  ## characters and turns the URL pluses into spaces. Returns "None Listed"
  ## when that field is missing or empty.
  # NOTE(review): assumes the field looks like `c=Company+Name` - confirm.
  let fields = link.split('&')
  if fields.len > 1 and fields[1].len > 2:
    fields[1][2..^1].replace("+", " ")
  else:
    noneListed

try:
  for feed in feeds:
    session = driver.createSession()
    sessionOpen = true
    try:
      sleep 3000

      #Getting the listing URLs from the feeds
      var rssFeedReply: RSS
      withRetries:
        echo "now reading " & feed
        rssFeedReply = getRSS(feed)

      for entry in rssFeedReply.items:
        #Sleep so indeed.com doesn't freak out
        if counter > 7:
          echo "resting for 7 seconds ..."
          sleep 7000
          counter = 0

        #Don't even bother visiting it if its in the cache
        let url = entry.link
        if knownListings.contains(listingId(url)):
          echo url & " was in cache, discarding"
          continue

        withRetries:
          echo "Telling chromium to navigate to " & url
          session.navigate(url)
        inc counter

        #HTML Parser
        echo "Beginning to parse..."
        let titleElement =
          session.findElement(".jobsearch-JobInfoHeader-title")
        let descElement = session.findElement("#jobDescriptionText")
        if titleElement.isNone or descElement.isNone:
          #Not cached, so the listing is tried again on the next run
          echo "Could not find the title or description, skipping " & url
          continue
        let jobTitle = titleElement.get().getText()
        let fullDesc = descElement.get().getText()
        let employer = employerFromLink(url)
        let salaryElement = session.findElement("#salaryInfoAndJobType")
        let salaryInfoAndJobType =
          if salaryElement.isSome: salaryElement.get().getText()
          else: noneListed
        echo "Finishing the parse..."

        #Filtering
        if titleblacklist.anyIt(jobTitle.contains(it)):
          echo "Trigger was hit, discarding " & url
          continue

        #Output
        echo "Beginning to write to file..."
        let output = listingTemplate %
          [jobTitle, employer, salaryInfoAndJobType, url, fullDesc]
        #`/` joins correctly whether or not outdir ends with a separator
        writeFile(outdir / (jobTitle.replace("/") & ".txt"), output)
        echo "Wrote job to file!"

        cache.writeLine(url)
        cache.flushFile() #keep the entry even if the run is cut short
        knownListings.add(url & "\n")
        echo "Wrote listing to cache!"
    finally:
      #Close the session even when a feed fails part-way through
      session.close()
      sessionOpen = false
finally:
  #Always release the cache file and stop chromedriver, even on errors
  cache.close()
  terminate(chromedriver)