## indeedwatcher
##
## Flat script: reads Indeed RSS search feeds listed in a TOML config,
## discards postings that are already cached or that hit a blacklist
## phrase, drives a local chromedriver to fetch each surviving listing,
## and writes one "<Employer> - <Title>.txt" file per job into `outdir`.

import rss
import webdriver
import osproc
import options
import os
import strutils
import parsetoml
import sequtils
import uri
import json

type
  IndeedJobDesc = object
    ## One posting, decoded from the query string of an RSS entry link.
    url: string       # full listing URL
    jobName: string
    employer: string
    location: string
    urlId: string     # Indeed's listing id; key for the de-dup cache

let configDir = getConfigDir() / "indeedwatcher"
let configPath = configDir / "config.toml"
let cacheDir = getCacheDir() / "indeedwatcher"
let cachePath = cacheDir / "listings.cache"

# First run: write an editable, commented default config.
# NOTE(review): the original template escaped the quotes (\") inside a
# triple-quoted literal; Nim copies those backslashes verbatim, which
# produced invalid TOML.  The quotes are now unescaped.
if not fileExists(configPath):
  createDir(configDir)
  let defaultConfig = """
#Output directory of your potential job listings
outdir = ""
#Port you wish chromedriver to use
port = 9515
#Number of times to retry before failing
retryNum = 30
#Location of chromedriver
chromedriver = "/usr/bin/chromedriver"
#If you would like headless mode enabled or not
headless = true
#Array of RSS urls that you wish the program to parse
feeds = [ "https://rss.indeed.com/rss?q=Information%20Technology&l=Remote&jt=contract&explvl=entry_level",
"https://rss.indeed.com/rss?q=Information%20Technology&l=Remote&jt=temporary&explvl=entry_level"]
#Phrases that, if they appear, will cause the job to be instantly thrown out
[blacklist]
title= ["Senior", "Sr."]
employer= ["NSA"]
location= ["Ohio"]
"""
  writeFile(configPath, defaultConfig)

# First run: create an empty de-duplication cache.
if not fileExists(cachePath):
  createDir(cacheDir)
  writeFile(cachePath, "")

#Reading the config file
let config = parsetoml.parseFile(configPath)
##Main section of config
let feeds = config["feeds"].getElems().mapIt(it.getStr())
let outdir = config["outdir"].getStr()
let retryNum = config["retryNum"].getInt()
let driverURL = "http://localhost:" & $config["port"].getInt()
##Cache section of config
let chromedriverloc = config["chromedriver"].getStr()
# fmAppend cannot also read; see the readFile() workaround further down.
let cache = open(cachePath, fmAppend)
##Filtering section of config
let titleblacklist = config["blacklist"]["title"].getElems().mapIt(it.getStr())
let employerblacklist = config["blacklist"]["employer"].getElems().mapIt(it.getStr())
let locationblacklist = config["blacklist"]["location"].getElems().mapIt(it.getStr())

##Does the user desire headlessness?
var args: JsonNode
if config["headless"].getBool():
  args = %*{
    "capabilities": {"alwaysMatch": {
      "goog:chromeOptions": {
        "args": ["headless", "lang=en_US", "window-size=1920,1080",
                 "start-maximized",
                 "user-agent=\"Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/109.0\""],
      }
    }}
  }
else:
  args = %*{
    "capabilities": {"alwaysMatch": {
      "goog:chromeOptions": {
        "args": ["start-maximized"]
      }
    }}
  }

#Webdriver
let chromedriver = startProcess(chromedriverloc)
sleep 5000  # give chromedriver a moment to start listening on the port
echo "connecting"
let driver = newWebDriver(driverURL)
var session: Session
var feedcounter = 0
var urlcounter = 0

#Behavior when CTRL+C: release every resource before dying.
proc handleInterrupt() {.noconv.} =
  echo "\nAcknowledged termination attempt..."
  echo "Closing the Cache..."
  cache.close()
  echo "Closing Session..."
  session.close()
  echo "Killing Chromedriver..."
  terminate(chromedriver)   # osproc.terminate on the Process handle
  echo "Dying!"
  quit()
setControlCHook(handleInterrupt)

proc isBlacklisted(posting: IndeedJobDesc): bool =
  ## True when ANY blacklist phrase occurs in the title, employer or
  ## location.  (Bug fix: the original joined the three checks with
  ## `and`, so a posting was only discarded when all three lists
  ## matched at once, letting nearly everything through.)
  titleblacklist.anyIt(posting.jobName.contains(it)) or
    employerblacklist.anyIt(posting.employer.contains(it)) or
    locationblacklist.anyIt(posting.location.contains(it))

for feed in feeds:
  session = driver.createSession(args)
  # Rest after every few feeds so indeed.com doesn't rate-limit us.
  if feedcounter > 3:
    echo "resting for 20 seconds ..."
    sleep 20000
    feedcounter = 0
  inc feedcounter  # bug fix: the counter was never incremented before

  #Getting the listing URLs from the feeds, with a simple retry loop.
  var rssFeedReply: RSS
  for attempt in countup(0, 3):
    try:
      echo "now reading " & feed
      rssFeedReply = getRSS(feed)
    except CatchableError:
      # Bug fixes: the guard compared against 30 although this loop only
      # runs 4 times, and the first retry slept 0 ms (10000 * 0).
      if attempt < 3:
        let attemptTime = 10000 * (attempt + 1)
        echo "Received an error: trying again in " & $(attemptTime / 1000) & " seconds..."
        sleep attemptTime
        continue
      else:
        raise
    break

  for entry in rssFeedReply.items:
    #Sleep so indeed.com doesn't freak out
    if urlcounter > 7:
      echo "resting for 10 seconds ..."
      sleep 10000
      urlcounter = 0

    # The RSS link's query string carries the listing metadata.
    # Assumes the first four parameters are title, employer, location
    # and listing id, in that order -- TODO confirm against the feed.
    let queries = entry.link.parseUri.query.decodeQuery().toSeq()
    let posting = IndeedJobDesc(
      url: entry.link,
      jobName: queries[0].value,
      employer: queries[1].value,
      location: queries[2].value,
      urlId: queries[3].value)

    #This isn't a read on `cache` itself because nim has no way to open
    #a file in append mode and also read from it; re-reading the whole
    #cache file from disk is the (inefficient but simple) workaround.
    if not readFile(cachePath).contains(posting.urlId) and
        not isBlacklisted(posting):
      for attempt in countup(0, retryNum):
        try:
          echo "Telling chromium to navigate to " & posting.url
          session.navigate(posting.url)
        except CatchableError:
          if attempt < retryNum:
            echo "Received an error: trying again..."
            continue
          else:
            raise
        break
      inc urlcounter

      #HTML Parser
      echo "Beginning to parse desc..."
      var fullDesc = session.findElement(
        "/html/body/div/div[2]/div/div[4]/div/div/div[1]/div[1]/div[5]/div[5]",
        strategy = XPathSelector).get().getText()
      echo "Beginning to parse salary info and job type..."
      var salaryInfoAndJobType: string
      try:
        salaryInfoAndJobType = session.findElement("#salaryInfoAndJobType").get().getText()
      except UnpackDefect:
        # The webdriver lib's Option.get raises UnpackDefect when the
        # listing has no salary/job-type element; treat as absent.
        salaryInfoAndJobType = "None Listed"
      echo "Beginning to write to file..."

      #Output
      var output = """
Title: $1
Employer: $2
Location: $3
Salary Info and Job Type: $4
URL : $5
Description:
$6
""" % [posting.jobName, posting.employer, posting.location,
       salaryInfoAndJobType, posting.url, fullDesc]
      # `outdir / file` copes with a missing trailing slash in outdir;
      # "/" is stripped from names so they can't escape the directory.
      writeFile(outdir / posting.employer.replace("/") & " - " &
        posting.jobName.replace("/") & ".txt", output)
      echo "Wrote job to file!"
      # The full URL contains the id, so the contains() check above
      # still matches entries written this way.
      cache.writeLine(posting.url)
      echo "Wrote listing to cache!"
    else:
      echo posting.url & " was in cache or hit a trigger word, discarding"
  session.close()

cache.close()
terminate(chromedriver)