From b9d8d41e2f1ad23ebe5d43d96f283822209f49b9 Mon Sep 17 00:00:00 2001 From: msglm Date: Wed, 11 Jan 2023 20:49:28 -0600 Subject: 4 big improvements config, if it doesn't exist, is created cache, if it doesn't exist, is created nimble build system added you can now blacklist words from job titles --- src/indeedwatcher.nim | 82 ++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 62 insertions(+), 20 deletions(-) (limited to 'src') diff --git a/src/indeedwatcher.nim b/src/indeedwatcher.nim index 0f86a18..c1946dc 100644 --- a/src/indeedwatcher.nim +++ b/src/indeedwatcher.nim @@ -7,16 +7,49 @@ import strutils import parsetoml import sequtils +if not fileExists(getConfigDir() & "/indeedwatcher/config.toml"): + createDir(getConfigDir() & "/indeedwatcher/") + let defaultConfig = """ + #Output directory of your porential job listings + outdir = "" + #Port you wish chromedriver to use + port = 9515 + #Location of chromedriver + chromedriver = "" + #Array of RSS urls that you wish the program to parse + feeds = [ \"https://rss.indeed.com/rss?q=Information%20Technology&l=Remote&jt=contract&explvl=entry_level\", \"https://rss.indeed.com/rss?q=Information%20Technology&l=Remote&jt=temporary&explvl=entry_level\"] + + #Phrases that, if they appear, will cause the job to be instantly thrown out + [blacklist] + title= [\"Senior\", \"Sr.\"] + """ + writeFile(getConfigDir() & "/indeedwatcher/config.toml", defaultConfig) + +if not fileExists(getCacheDir() & "/indeedwatcher/config.toml"): + createDir(getCacheDir() & "/indeedwatcher/") + writeFile(getCacheDir() & "/indeedwatcher/listings.cache", "") + + #TODO make this create folders and files for this automatically upon first start up + +#Reading the config file let config = parsetoml.parseFile(getConfigDir() & "/indeedwatcher/config.toml") + +##Main section of config let feeds = config["feeds"].getElems().mapIt(it.getStr()) let outdir = config["outdir"].getStr() + +##Cache section of config let chromedriverloc = config["chromedriver"].getStr() let cachefileloc = getCacheDir() & "/indeedwatcher/listings.cache" var cache = splitLines(readFile(cachefileloc)) +##Filtering section of config +let titleblacklist = config["blacklist"]["title"].getElems().mapIt(it.getStr()) + + #Webdriver let chromedriver = startProcess(chromedriverloc, "", ["--headless"]) sleep 5000 @@ -42,9 +75,18 @@ for feed in feeds: #Getting the listing URLs from the feeds echo "now reading " & feed - sleep 1000 - var rssFeedReply = getRSS(feed) - + var rssFeedReply: RSS + for attempt in countup(0,5): + try: + rssFeedReply = getRSS(feed) + except: + if attempt < 5 - 1: + continue + else: + raise + break + + for entry in rssFeedReply.items: #Sleep so indeed.com doesn't freak out if counter > 7: @@ -55,9 +97,6 @@ for feed in feeds: #Don't even bother visiting it if its in the cache var URL = entry.link let URLID = entry.link.split('&')[3] - echo URL - echo URLID - echo any(cache, proc (input: string): bool = input.contains(URLID)) if not any(cache, proc (input: string): bool = input.contains(URLID)): session.navigate(URL) counter = counter + 1 @@ -78,20 +117,23 @@ for feed in feeds: except UnpackDefect: salaryInfoAndJobType = "None Listed" - - #Job Value Scorer - - #Output - var output = """ - Title: $1 - Company: $2 - Salary Info and Job Type: $3 - URL : $4 - Description: - $5 - """ % [jobTitle, employer, salaryInfoAndJobType, URL, fullDesc] - writeFile(outdir & jobTitle.replace("/") & ".txt", output) - cache.add(URL) + #Filtering + if not any(titleblacklist, proc (input: string): bool = jobTitle.contains(input)): + #Output + var output = """ + Title: $1 + Company: $2 + Salary Info and Job Type: $3 + URL : $4 + Description: + $5 + """ % [jobTitle, employer, salaryInfoAndJobType, URL, fullDesc] + writeFile(outdir & jobTitle.replace("/") & ".txt", output) + cache.add(URL) + else: + echo "Trigger was hit, discarding " & URL + else: + echo URL & " was in cache, discarding" session.close() terminate(chromedriver) -- cgit v1.2.3