1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
|
import rss
import webdriver
import osproc
import options
import os
import strutils
import parsetoml
import sequtils
#TODO make this create folders and files for this automatically upon first start up
# Load user configuration: RSS feed URLs to poll, where to write job files,
# and the path to the chromedriver binary.
let config = parsetoml.parseFile(getConfigDir() & "/indeedwatcher/config.toml")
let feeds = config["feeds"].getElems().mapIt(it.getStr())
let outdir = config["outdir"].getStr()
let chromedriverloc = config["chromedriver"].getStr()

# Cache of already-seen listing URLs, one per line.  Create the cache
# directory and an empty cache file on first start-up so readFile below
# cannot raise IOError on a fresh install.
let cachefileloc = getCacheDir() & "/indeedwatcher/listings.cache"
createDir(getCacheDir() & "/indeedwatcher")
if not fileExists(cachefileloc):
  writeFile(cachefileloc, "")
var cache = splitLines(readFile(cachefileloc))
#Webdriver
# Start chromedriver headless and give it time to bind its port before
# we try to connect.
let chromedriver = startProcess(chromedriverloc, "", ["--headless"])
sleep 5000
echo "connecting"
#TODO make the port configurable, some users may have something running here
let driver = newWebDriver("http://localhost:9515")
let session = driver.createSession()
var counter = 0
# Ctrl-C handler: persist the seen-listings cache, then tear down the
# webdriver session and the chromedriver process before exiting.
# {.noconv.} is required by setControlCHook.  Statement order matters:
# the cache must be flushed before anything that could fail.
proc terminate() {.noconv.} =
  echo "\nAcknowledged termination attempt..."
  echo "Writing Cache..."
  # Persist seen-listing URLs so they are skipped on the next run.
  writeFile(cachefileloc, cache.join("\n"))
  echo "Closing Session..."
  session.close()
  echo "Killing Chromedriver..."
  # Resolves to osproc.terminate(Process) — not a recursive call, since
  # this proc takes no arguments.
  terminate(chromedriver)
  echo "Dying!"
  quit()
setControlCHook(terminate)
# Main scrape loop: for each configured feed, visit every listing not yet
# in the cache, extract the job details, and write one text file per job.
for feed in feeds:
  #Getting the listing URLs from the feeds
  echo "now reading " & feed
  sleep 1000
  var rssFeedReply = getRSS(feed)
  for entry in rssFeedReply.items:
    #Sleep so indeed.com doesn't freak out
    if counter > 7:
      echo "resting for 7 seconds ..."
      sleep 7000
      counter = 0
    #Don't even bother visiting it if its in the cache
    var URL = entry.link
    # The listing id is expected in the 4th '&'-separated component of the
    # URL; fall back to the whole URL if the feed format ever changes so we
    # never raise IndexDefect here.
    let urlParts = entry.link.split('&')
    let URLID = if urlParts.len > 3: urlParts[3] else: entry.link
    echo URL
    echo URLID
    # Compute cache membership once (previously evaluated twice).
    let seen = any(cache, proc (input: string): bool = input.contains(URLID))
    echo seen
    if not seen:
      session.navigate(URL)
      counter = counter + 1
      #HTML Parser
      let jobTitle = session.findElement(".jobsearch-JobInfoHeader-title").get().getText()
      let fullDesc = session.findElement("#jobDescriptionText").get().getText()
      # Optional page elements: check isSome instead of catching
      # UnpackDefect from Option.get() — Defects are not reliably
      # catchable (e.g. under --panics:on).
      let employerElem = session.findElement(".jobsearch-InlineCompanyRating-companyHeader")
      let employer =
        if employerElem.isSome: employerElem.get().getText()
        else: "None Listed"
      let salaryElem = session.findElement("#salaryInfoAndJobType")
      let salaryInfoAndJobType =
        if salaryElem.isSome: salaryElem.get().getText()
        else: "None Listed"
      #Job Value Scorer
      #Output
      var output = """
Title: $1
Company: $2
Salary Info and Job Type: $3
URL : $4
Description:
$5
      """ % [jobTitle, employer, salaryInfoAndJobType, URL, fullDesc]
      # Strip '/' from the title so it cannot escape outdir as a path
      # component.  NOTE(review): assumes outdir ends with a separator —
      # confirm against users' config convention.
      writeFile(outdir & jobTitle.replace("/") & ".txt", output)
      cache.add(URL)
# Normal-exit cleanup mirrors the Ctrl-C handler.
session.close()
terminate(chromedriver)
writeFile(cachefileloc, cache.join("\n"))
|