1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
|
import rss
import webdriver
import osproc
import options
import os
import strutils
import parsetoml
import sequtils
# First-run bootstrap: write a commented default config if none exists yet.
if not fileExists(getConfigDir() & "/indeedwatcher/config.toml"):
  createDir(getConfigDir() & "/indeedwatcher/")
  # NOTE: backslashes are literal inside a triple-quoted string, so the
  # original \" sequences were written verbatim into the TOML file and made
  # it unparseable. Plain double quotes are what TOML expects.
  let defaultConfig = """
#Output directory of your porential job listings
outdir = ""
#Port you wish chromedriver to use
port = 9515
#Location of chromedriver
chromedriver = ""
#Array of RSS urls that you wish the program to parse
feeds = [ "https://rss.indeed.com/rss?q=Information%20Technology&l=Remote&jt=contract&explvl=entry_level", "https://rss.indeed.com/rss?q=Information%20Technology&l=Remote&jt=temporary&explvl=entry_level"]
#Phrases that, if they appear, will cause the job to be instantly thrown out
[blacklist]
title= ["Senior", "Sr."]
"""
  writeFile(getConfigDir() & "/indeedwatcher/config.toml", defaultConfig)
# Ensure the listings cache exists so the later open()/readFile() calls work.
if not fileExists(getCacheDir() & "/indeedwatcher/listings.cache"):
  createDir(getCacheDir() & "/indeedwatcher/")
  writeFile(getCacheDir() & "/indeedwatcher/listings.cache", "")
#Reading the config file
let config = parsetoml.parseFile(getConfigDir() & "/indeedwatcher/config.toml")
##Main section of config
let feeds = config["feeds"].getElems().mapIt(it.getStr())
let outdir = config["outdir"].getStr()
##Cache section of config
let chromedriverloc = config["chromedriver"].getStr()
let cachefileloc = getCacheDir() & "/indeedwatcher/listings.cache"
# Opened in append mode for the whole run; reads go through readFile() below.
let cache = open(cachefileloc, fmAppend)
##Filtering section of config
let titleblacklist = config["blacklist"]["title"].getElems().mapIt(it.getStr())
# Honour the documented `port` key; fall back to chromedriver's default 9515
# for configs written before the key existed.
let port =
  try:
    config["port"].getInt()
  except KeyError:
    9515
#Webdriver
# Tell chromedriver which port to listen on so it matches the URL we dial.
let chromedriver = startProcess(chromedriverloc, "", ["--headless", "--port=" & $port])
sleep 5000
echo "connecting"
let driver = newWebDriver("http://localhost:" & $port)
var session: Session
var counter = 0
#Behavior when CTRL+C
proc terminate() {.noconv.} =
echo "\nAcknowledged termination attempt..."
echo "Closing the Cache..."
cache.close()
echo "Closing Session..."
session.close()
echo "Killing Chromedriver..."
terminate(chromedriver)
echo "Dying!"
quit()
setControlCHook(terminate)
# Main loop: for every configured feed, fetch its RSS entries, visit each
# unseen listing with chromium, scrape the posting, filter it, and write
# accepted listings to disk while recording visited URLs in the cache.
for feed in feeds:
  session = driver.createSession()
  sleep 3000
  #Getting the listing URLs from the feeds
  var rssFeedReply: RSS
  # Retry transient fetch errors up to 3 times; re-raise on the final attempt.
  for attempt in countup(0, 3):
    try:
      echo "now reading " & feed
      rssFeedReply = getRSS(feed)
    except CatchableError:
      if attempt < 3:
        echo "Recieved an error: trying again..."
        continue
      else:
        raise
    break
  for entry in rssFeedReply.items:
    #Sleep so indeed.com doesn't freak out
    if counter > 7:
      echo "resting for 10 seconds ..."
      sleep 10000
      counter = 0
    #Don't even bother visiting it if its in the cache
    let URL = entry.link
    # Guard against links with fewer '&'-separated fields than expected:
    # fall back to the full URL as the cache key instead of crashing with
    # an IndexDefect.
    let urlParts = URL.split('&')
    let URLID = if urlParts.len > 3: urlParts[3] else: URL
    #This isn't cache.readFile().contains(URLID)
    #because nim has no way to both open a file in append mode
    #and also open it as reading. Therefore, this blunder, which
    #creates a new file in memory, is used instead.
    if not readFile(cachefileloc).contains(URLID):
      # Same retry policy as the feed fetch, for flaky navigations.
      for attempt in countup(0, 3):
        try:
          echo "Telling chromium to navigate to " & URL
          session.navigate(URL)
        except CatchableError:
          if attempt < 3:
            echo "Recieved an error: trying again..."
            continue
          else:
            raise
        break
      inc counter
      #HTML Parser
      echo "Beginning to parse..."
      let jobTitle = session.findElement(".jobsearch-JobInfoHeader-title").get().getText()
      let fullDesc = session.findElement("#jobDescriptionText").get().getText()
      var employer: string
      try:
        #This takes the location from the URL, removes all the junk around it,
        #and replaces the URL pluses with actual spaces.
        #Perhaps, a URL parsing library could have been used for this.
        employer = urlParts[1][2..^1].replace("+", " ")
      except IndexDefect, RangeDefect:
        # BUG FIX: the original caught UnpackDefect, but out-of-range
        # indexing/slicing raises IndexDefect/RangeDefect, so malformed
        # links crashed instead of falling back to "None Listed".
        employer = "None Listed"
      var salaryInfoAndJobType: string
      try:
        salaryInfoAndJobType = session.findElement("#salaryInfoAndJobType").get().getText()
      except UnpackDefect:
        # Option.get() on a missing element raises UnpackDefect.
        salaryInfoAndJobType = "None Listed"
      echo "Finishing the parse..."
      #Filtering
      if not titleblacklist.anyIt(jobTitle.contains(it)):
        echo "Beginning to write to file..."
        #Output
        let output = """
Title: $1
Company: $2
Salary Info and Job Type: $3
URL : $4
Description:
$5
""" % [jobTitle, employer, salaryInfoAndJobType, URL, fullDesc]
        # Strip '/' from the title so it cannot escape the output directory.
        writeFile(outdir & jobTitle.replace("/") & ".txt", output)
        echo "Wrote job to file!"
        cache.writeLine(URL)
        echo "Wrote listing to cache!"
      else:
        echo "Trigger was hit, discarding " & URL
    else:
      echo URL & " was in cache, discarding"
  session.close()
cache.close()
# osproc.terminate — kills the chromedriver child process.
terminate(chromedriver)
|