author     msglm <msglm@techchud.xyz>  2023-01-11 20:49:27 -0600
committer  msglm <msglm@techchud.xyz>  2023-01-11 20:49:27 -0600
commit     40fc1eb64fb8da26544bd2e2b4855813ffb2f244 (patch)
tree       38247b31b502881ad026de586081d5f16c13cd19
parent     07e77fb6dc8a5ab9472db86f269c3a844199add1 (diff)
download   indeedwatcher-40fc1eb64fb8da26544bd2e2b4855813ffb2f244.tar.gz
           indeedwatcher-40fc1eb64fb8da26544bd2e2b4855813ffb2f244.tar.bz2
           indeedwatcher-40fc1eb64fb8da26544bd2e2b4855813ffb2f244.zip
cache system actually works now
-rw-r--r--  spec/flowchart.gv      |   3
-rw-r--r--  spec/flowchart.png     | bin  134627 -> 120196 bytes
-rw-r--r--  src/indeedwatcher.nim  | 148
3 files changed, 80 insertions, 71 deletions
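The Nim rewrite in the diff below drops the hard-coded RSS URL and chromedriver path in favour of a TOML file read from getConfigDir() & "/indeedwatcher/config.toml" via parsetoml. The keys it reads are feeds (an array of RSS feed URLs), outdir, and chromedriver. As an illustration only (the repository does not ship a sample config), a file consistent with that code could look like the following; the outdir path is a made-up placeholder, while the feed URL and chromedriver path reuse values the old code had hard-coded:

# ~/.config/indeedwatcher/config.toml (illustrative values)
feeds = [
  "https://rss.indeed.com/rss?q=Linux&l=Arkansas&explvl=mid_level"
]
outdir = "/home/user/indeedwatcher/listings/"   # trailing slash: the code appends "<job title>.txt"
chromedriver = "/usr/bin/chromedriver"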
diff --git a/spec/flowchart.gv b/spec/flowchart.gv
index 6d181ec..1a41936 100644
--- a/spec/flowchart.gv
+++ b/spec/flowchart.gv
@@ -16,8 +16,7 @@
 config -> "Job Value Scorer" [label="Priorities"]
 config -> "Cache System" [label="Cache location"]
 "Job Value Scorer" -> "Cache System" [label="URL of currently parsing job post"]
 "Cache System" -> "Job Value Scorer" [label="True or False to if URL is in database"]
-"Job Value Scorer" -> "latexdsl" [label="Formatted job data"]
-"latexdsl" -> "Saver" [label="PDF file"]
+"Job Value Scorer" -> "Saver" [label="Formatted job data"]
 config -> "Saver" [label="save location"]
 "Saver" -> "Filesystem" [label="PDF file"]
diff --git a/spec/flowchart.png b/spec/flowchart.png
index 8c6bc0d..e74f0fe 100644
--- a/spec/flowchart.png
+++ b/spec/flowchart.png
Binary files differ
diff --git a/src/indeedwatcher.nim b/src/indeedwatcher.nim
index 1807e93..2e5ff87 100644
--- a/src/indeedwatcher.nim
+++ b/src/indeedwatcher.nim
@@ -4,83 +4,93 @@
 import osproc
 import options
 import os
 import strutils
+import parsetoml
+import sequtils
 
-#Feednim
-var test = getRSS("https://rss.indeed.com/rss?q=Linux&l=Arkansas&explvl=mid_level")
-var URL = test.items[5].link
+
+
+#TODO make this create folders and files for this automatically upon first start up
+let config = parsetoml.parseFile(getConfigDir() & "/indeedwatcher/config.toml")
+let feeds = config["feeds"].getElems().mapIt(it.getStr())
+let outdir = config["outdir"].getStr()
+let chromedriverloc = config["chromedriver"].getStr()
+let cachefileloc = getCacheDir() & "/indeedwatcher/listings.cache"
+var cache = splitLines(readFile(cachefileloc))
 
 #Webdriver
-#TODO put location of chromedriver into config
-let chromedriver = startProcess("/usr/bin/chromedriver")
+let chromedriver = startProcess(chromedriverloc, "", ["--headless", "--disable-gpu"])
 sleep 5000
 echo "connecting"
+#TODO make the port configurable, some users may have something running here
 let driver = newWebDriver("http://localhost:9515")
 let session = driver.createSession()
-session.navigate(URL)
-
-#HTML Parser
-var jobTimes: string
-var salaryGuide: string
-
-let jobTitle = session.findElement(".jobsearch-JobInfoHeader-title").get().getText()
-let employer = session.findElement(".jobsearch-CompanyReview--heading").get().getText()
-
-try:
-  jobTimes = session.findElement(".jobsearch-JobDescriptionSection-sectionItem").get().getText()
-except UnpackDefect:
-  jobTimes = ""
+var counter = 0
+
+proc terminate() {.noconv.} =
+  echo "\nAcknowledged termination attempt..."
+  echo "Writing Cache..."
+  writeFile(cachefileloc, cache.join("\n"))
+  echo "Closing Session..."
+  session.close()
+  echo "Killing Chromedriver..."
+  terminate(chromedriver)
+  echo "Dying!"
+  quit()
+setControlCHook(terminate)
+
+for feed in feeds:
+  echo "now reading " & feed
+
+  #Feednim
+  var rssFeedReply = getRSS(feed)
+
+  for entry in rssFeedReply.items:
+    echo entry.link
+    #Logging
+
+    if counter > 7:
+      echo "resting for 7 seconds ..."
+      sleep 7000
+      counter = 0
+    var URL = entry.link
+    let URLID = entry.link.split('&')[4]
+    echo any(cache, proc (input: string): bool = input.contains(URLID))
+    if not any(cache, proc (input: string): bool = input.contains(URLID)):
+      session.navigate(URL)
+      counter = counter + 1
+
+      #HTML Parser
+      var salaryInfoAndJobType: string
+      var employer: string
+      let jobTitle = session.findElement(".jobsearch-JobInfoHeader-title").get().getText()
+
+      try:
+        employer = session.findElement(".jobsearch-InlineCompanyRating-companyHeader").get().getText()
+      except UnpackDefect:
+        salaryInfoAndJobType = "None Listed"
+
+      try:
+        salaryInfoAndJobType = session.findelement("#salaryInfoAndJobType").get().gettext()
+      except UnpackDefect:
+        salaryInfoAndJobType = "None Listed"
+
+      let fullDesc = session.findElement("#jobDescriptionText").get().getText()
+
+      #Job Value Scorer
+
+      #Output
+      var output = """
+      Title: $1
+      Company: $2
+      Salary Info and Job Type: $3
+      URL : $4
+      Description:
+      $5
+      """ % [jobTitle, employer, salaryInfoAndJobType, URL, fullDesc]
+      writeFile(outdir & jobTitle.replace("/") & ".txt", output)
+      cache.add(URL)
 
-try:
-  salaryGuide = session.findelement("#salaryGuide").get().gettext()
-except UnpackDefect:
-  salaryGuide = "None Listed"
-
-if salaryGuide.contains("Not provided by employer"):
-  salaryGuide = "None Listed"
-
-let fullDesc = session.findElement("#jobDescriptionText").get().getText()
 session.close()
 terminate(chromedriver)
+writeFile(cachefileloc, cache.join("\n"))
-
-#Job Value Scorer
-
-#Parsing Salary
-
-#Output
-var output = """
-\documentclass{article}
-\usepackage[margin=0.7in]{geometry}
-\usepackage{pdfpages}
-\usepackage{hyperref}
-\hypersetup{
-    colorlinks=true,
-    linkcolor=black,
-    filecolor=magenta,
-    urlcolor=blue,
-    }
-
-
-\begin{document}
-
-\begin{center}
-	\Huge{$1}
-\end{center}
-\hrulefill
-
-	\large{$2}
-	\hspace{3cm}
-	\large{$3}
-	\hspace{3cm}
-	\large{\href{$4}{URL}}
-	\hspace{3cm}
-	\large{$5}
-
-\hrulefill
-
-$6
-
-\end{document}
-""" % [jobTitle, employer, salaryGuide, URL.replace("&","\\&"), jobTimes, fullDesc.replace("#", "\\#").replace("&", "\\&").replace("\"", "\\\"").replace("'","\\'")]
-
-writeFile("/tmp/test.tex", output)
-discard execCmd( "latexmk -pdf -pvc /tmp/test.tex")
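The "cache system" from the commit message is a plain-text file at getCacheDir() & "/indeedwatcher/listings.cache", one previously seen listing URL per line. For each feed entry the program takes the fifth '&'-separated field of the link as a job ID, skips the entry if any cached line contains it, appends new URLs to the cache, and writes the file back at exit and from the Ctrl-C hook. Below is a minimal standalone sketch of that round trip in the same spirit as the committed code; the entry link is hypothetical, and the missing-file guard is an addition the commit itself does not make:

import os
import strutils
import sequtils

let cachefileloc = getCacheDir() & "/indeedwatcher/listings.cache"

# One previously seen listing URL per line; start empty when no cache exists yet.
var cache =
  if fileExists(cachefileloc): splitLines(readFile(cachefileloc))
  else: newSeq[string]()

# Hypothetical feed entry link, shaped so that split('&')[4] yields the job key.
let url = "https://www.indeed.com/rc/clk?q=Linux&l=Arkansas&explvl=mid_level&from=rss&jk=0123456789abcdef"
let urlID = url.split('&')[4]

# Same membership test the diff uses: a substring match against every cached line.
if not any(cache, proc (line: string): bool = line.contains(urlID)):
  # ...fetch, parse and save the listing here...
  cache.add(url)

# Persist the cache; the committed code also does this from its Ctrl-C hook.
createDir(parentDir(cachefileloc))
writeFile(cachefileloc, cache.join("\n"))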