From c0f5799ae651b0ee3651e20302446475bcac17e0 Mon Sep 17 00:00:00 2001
From: msglm
Date: Tue, 14 Feb 2023 23:31:33 -0600
Subject: release 1.0.5

Trigger words discard much earlier using the job name in the URL, increasing stability and speed.
deprecated old way of parsing urls for nim's uri library for readability's sake.
longer waiting times for both rss feeds and opening URLs.
cache no longer is overwritten on start up
---
Review notes (below the three-dash line, so not part of the commit message):
 * The cache / trigger-word guard joins its two tests with `and`. With `or`,
   a listing was skipped only when it was BOTH already cached AND matched a
   trigger word, so cached or blacklisted listings were still visited.
 * feedcounter is incremented once per feed. It was declared, compared and
   reset but never incremented, so the 20 second rest could never run.
 * Hunk headers and the diffstat were adjusted for the one added line; the
   post-image blob id on the index line was left untouched and no longer
   matches the patched file.
 * TODO confirm: this copy of the patch had its whitespace collapsed.
   Indentation (assumed 4 spaces), the position of some blank lines, and the
   indentation inside the """ string were reconstructed from the hunk line
   counts. Verify against the repository before applying.

 indeedwatcher.nimble  |  2 +-
 src/indeedwatcher.nim | 70 ++++++++++++++++++++++++---------------------------
 2 files changed, 35 insertions(+), 37 deletions(-)

diff --git a/indeedwatcher.nimble b/indeedwatcher.nimble
index c647a88..cf69199 100644
--- a/indeedwatcher.nimble
+++ b/indeedwatcher.nimble
@@ -1,6 +1,6 @@
 # Package
 
-version = "1.0.4"
+version = "1.0.5"
 author = "msglm"
 description = "Watches indeed for job updates."
 license = "AGPL-3.0-only"
diff --git a/src/indeedwatcher.nim b/src/indeedwatcher.nim
index bc52138..8c4a71e 100644
--- a/src/indeedwatcher.nim
+++ b/src/indeedwatcher.nim
@@ -6,6 +6,7 @@ import os
 import strutils
 import parsetoml
 import sequtils
+import uri
 
 if not fileExists(getConfigDir() & "/indeedwatcher/config.toml"):
     createDir(getConfigDir() & "/indeedwatcher/")
@@ -53,7 +54,8 @@ echo "connecting"
 #TODO make the port configurable, some users may have something running here
 let driver = newWebDriver("http://localhost:9515")
 var session: Session
-var counter = 0
+var feedcounter = 0
+var urlcounter = 0
 
 #Behavior when CTRL+C
 proc terminate() {.noconv.} =
@@ -70,7 +72,11 @@ setControlCHook(terminate)
 
 for feed in feeds:
     session = driver.createSession()
-    sleep 3000
+    if feedcounter > 3:
+        echo "resting for 20 seconds ..."
+        sleep 20000
+        feedcounter = 0
+    feedcounter = feedcounter + 1
     #Getting the listing URLs from the feeds
     var rssFeedReply: RSS
     for attempt in countup(0,3):
@@ -87,20 +93,24 @@ for feed in feeds:
 
     for entry in rssFeedReply.items:
         #Sleep so indeed.com doesn't freak out
-        if counter > 7:
+        if urlcounter > 7:
             echo "resting for 10 seconds ..."
             sleep 10000
-            counter = 0
+            urlcounter = 0
 
-        #Don't even bother visiting it if its in the cache
+        #Don't even bother visiting it if its in the cache or hits a trigger word
         var URL = entry.link
-        let URLID = entry.link.split('&')[3]
+        let queries = URL.parseUri.query.decodeQuery().toSeq()
+        let jobName = queries[0].value
+        let employer = queries[1].value
+        let location = queries[2].value
+        let URLID = queries[3].value
 
         #This isn't cache.readFile().contains(URLID)
         #because nim has no way to both open a file in append mode
         #and also open it as reading. Therefore, this blunder, which
         #creates a new file in memory, is used instead.
-        if not readFile(cachefileloc).contains(URLID):
+        if not readFile(cachefileloc).contains(URLID) and not any(titleblacklist, proc (input: string): bool = jobName.contains(input)):
             for attempt in countup(0,3):
                 try:
                     echo "Telling chromium to navigate to " & URL
@@ -112,21 +122,12 @@ for feed in feeds:
                         else:
                             raise
                 break
-            counter = counter + 1
+            urlcounter = urlcounter + 1
 
             #HTML Parser
             echo "Beginning to parse..."
-            let jobTitle = session.findElement(".jobsearch-JobInfoHeader-title").get().getText()
             let fullDesc = session.findElement("#jobDescriptionText").get().getText()
 
-            var employer: string
-            try:
-                #This takes the location from the URL, removes all the junk around it, and replaced the URL pluses with actual spaces
-                #perhaps, a URL parsing library could have been used for this.
-                employer = entry.link.split('&')[1][2..^1].replace("+"," ")
-            except UnpackDefect:
-                employer = "None Listed"
-
             var salaryInfoAndJobType: string
             try:
                 salaryInfoAndJobType = session.findelement("#salaryInfoAndJobType").get().gettext()
@@ -134,26 +135,23 @@ for feed in feeds:
                 salaryInfoAndJobType = "None Listed"
             echo "Finishing the parse..."
 
-            #Filtering
-            if not any(titleblacklist, proc (input: string): bool = jobTitle.contains(input)):
-                echo "Beginning to write to file..."
-                #Output
-                var output = """
-                Title: $1
-                Company: $2
-                Salary Info and Job Type: $3
-                URL : $4
-                Description:
-                $5
-                """ % [jobTitle, employer, salaryInfoAndJobType, URL, fullDesc]
-                writeFile(outdir & jobTitle.replace("/") & ".txt", output)
-                echo "Wrote job to file!"
-                cache.writeLine(URL)
-                echo "Wrote listing to cache!"
-            else:
-                echo "Trigger was hit, discarding " & URL
+            echo "Beginning to write to file..."
+            #Output
+            var output = """
+            Title: $1
+            Employer: $2
+            Location: $3
+            Salary Info and Job Type: $4
+            URL : $5
+            Description:
+            $6
+            """ % [jobName, employer, location, salaryInfoAndJobType, URL, fullDesc]
+            writeFile(outdir & jobName.replace("/") & ".txt", output)
+            echo "Wrote job to file!"
+            cache.writeLine(URL)
+            echo "Wrote listing to cache!"
         else:
-            echo URL & " was in cache, discarding"
+            echo URL & " was in cache or hit a trigger word, discarding"
 
     session.close()
 
-- 
cgit v1.2.3