path: root/src/indeedwatcher.nim
import rss
import webdriver
import osproc
import options
import os
import strutils
import parsetoml
import sequtils
import uri
import json
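
#indeedwatcher polls the Indeed RSS feeds listed in the config file, skips
#listings that hit a blacklist phrase or are already in the cache, fetches the
#full description of everything else through chromedriver, and writes each
#surviving posting to a text file in the configured output directory.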

type
    indeedJobDesc = object
        URL: string      ## Link to the full job posting
        jobName: string  ## Job title, taken from the feed link's query string
        employer: string ## Employer name, also from the query string
        location: string ## Listed location, also from the query string
        URLID: string    ## Unique listing id, used for cache deduplication

if not fileExists(getConfigDir() & "/indeedwatcher/config.toml"):
    createDir(getConfigDir() & "/indeedwatcher/")
    let defaultConfig = """
    #Output directory of your potential job listings
    outdir = ""
    #Port you wish chromedriver to use
    port = 9515
    #Number of times to retry before failing
    retryNum = 30
    #Location of chromedriver
    chromedriver = "/usr/bin/chromedriver"
    #If you would like headless mode enabled or not
    headless = true
    #Array of RSS urls that you wish the program to parse
    feeds = [ \"https://rss.indeed.com/rss?q=Information%20Technology&l=Remote&jt=contract&explvl=entry_level\", \"https://rss.indeed.com/rss?q=Information%20Technology&l=Remote&jt=temporary&explvl=entry_level\"]
    
    #Phrases that, if they appear, will cause the job to be instantly thrown out
    [blacklist]
    title= [\"Senior\", \"Sr.\"]
    employer= [\"NSA\"]
    location= [\"Ohio\"]
    """
    writeFile(getConfigDir() & "/indeedwatcher/config.toml", defaultConfig)

if not fileExists(getCacheDir() & "/indeedwatcher/listings.cache"):
    createDir(getCacheDir() & "/indeedwatcher/")
    writeFile(getCacheDir() & "/indeedwatcher/listings.cache", "")


#Reading the config file
let config = parsetoml.parseFile(getConfigDir() & "/indeedwatcher/config.toml")

##Main section of config
let feeds = config["feeds"].getElems().mapIt(it.getStr())
let outdir = config["outdir"].getStr()
let retryNum = config["retryNum"].getInt()
let driverURL = "http://localhost:" & config["port"].getInt().intToStr()
##Cache section of config
let chromedriverloc = config["chromedriver"].getStr()
let cachefileloc = getCacheDir() & "/indeedwatcher/listings.cache" 
let cache = open(cachefileloc, fmAppend)
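#The cache handle is append-only; the membership check in the main loop
#re-reads the cache file from disk rather than reading through this handle.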

##Filtering section of config
let titleblacklist = config["blacklist"]["title"].getElems().mapIt(it.getStr())
let employerblacklist = config["blacklist"]["employer"].getElems().mapIt(it.getStr())
let locationblacklist = config["blacklist"]["location"].getElems().mapIt(it.getStr())

##Does the user desire headlessness?
var args: JsonNode
if config["headless"].getBool():
    args = %*{ "capabilities": {"alwaysMatch": { "goog:chromeOptions": { "args": ["headless", "lang=en_US", "window-size=1920,1080", "start-maximized", "user-agent=\"Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/109.0\""], } } } }

else:
    args = %*{ "capabilities": {"alwaysMatch": { "goog:chromeOptions": { "args": ["start-maximized"] } } } }

#Webdriver
let chromedriver = startProcess(chromedriverloc)
sleep 5000
echo "connecting"
#TODO make the port configurable, some users may have something running here
let driver = newWebDriver(driverURL)
var session: Session
var feedcounter = 0
var urlcounter = 0

#Behavior when CTRL+C
proc terminate() {.noconv.} = 
    echo "\nAcknowledged termination attempt..."
    echo "Closing the Cache..."
    cache.close()
    echo "Closing Session..."
    session.close()
    echo "Killing Chromedriver..."
    terminate(chromedriver)
    echo "Dying!"
    quit()
setControlCHook(terminate)

proc postBlacklisted(posting: indeedJobDesc) : bool =
    ## True if any blacklist phrase appears in the posting's title, employer,
    ## or location; such postings are discarded by the caller.
    return any(titleblacklist, proc (input: string): bool = posting.jobName.contains(input)) or
        any(employerblacklist, proc (input: string): bool = posting.employer.contains(input)) or
        any(locationblacklist, proc (input: string): bool = posting.location.contains(input))

for feed in feeds:
    session = driver.createSession(args)
    #Rest every few feeds so indeed.com doesn't freak out
    feedcounter += 1
    if feedcounter > 3:
        echo "resting for 20 seconds ..."
        sleep 20000
        feedcounter = 0
    #Getting the listing URLs from the feeds
    var rssFeedReply: RSS
    for attempt in countup(0,3):
        try:
            echo "now reading " & feed
            rssFeedReply = getRSS(feed)
        except:
            if attempt < 3:
                var attemptTime = 10000*attempt
                echo "Received an error: trying again in " & $(attemptTime/1000) & " seconds..."
                sleep attemptTime
                continue
            else:
                raise
        break

    for entry in rssFeedReply.items:
        #Sleep so indeed.com doesn't freak out
        if urlcounter > 7:
            echo "resting for 10 seconds ..."
            sleep 10000
            urlcounter = 0

        #Don't even bother visiting it if it's in the cache or hits a trigger word
        let queries = entry.link.parseUri.query.decodeQuery().toSeq()
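        #The feed links are assumed to carry the job title, employer, location
        #and a unique listing id as their first four query parameters, in that
        #order; the object below indexes into the decoded query accordingly.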
        let posting = indeedJobDesc(URL: entry.link, jobName: queries[0].value, employer: queries[1].value, location: queries[2].value, URLID: queries[3].value)

        #This isn't cache.readAll().contains(URLID) because Nim has no FileMode
        #that opens a file for both appending and reading. As a workaround, the
        #whole cache file is re-read from disk into memory for every lookup.
        if not readFile(cachefileloc).contains(posting.URLID) and not postBlacklisted(posting):
            for attempt in countup(0,retryNum):
                try:
                    echo "Telling chromium to navigate to " & posting.URL
                    session.navigate(posting.URL)
                except:
                    if attempt < retryNum:
                        echo "Recieved an error: trying again..."
                        continue
                    else:
                        raise
                break
            urlcounter = urlcounter + 1
        
            #HTML Parser
            echo "Beginning to parse desc..."
            var fullDesc = session.findElement("/html/body/div/div[2]/div/div[4]/div/div/div[1]/div[1]/div[5]/div[5]", strategy=XPathSelector).get().getText()
            
            echo "Beginning to parse salary info and job type..."
            var salaryInfoAndJobType: string
            try:
                salaryInfoAndJobType = session.findElement("#salaryInfoAndJobType").get().getText()
            except UnpackDefect:
                salaryInfoAndJobType = "None Listed"
            
            echo "Beginning to write to file..."
            #Output
            var output = """
            Title: $1
            Employer: $2
            Location: $3
            Salary Info and Job Type: $4
            URL : $5
            Description:
            $6
            """ % [posting.jobName, posting.employer, posting.location, salaryInfoAndJobType, posting.URL, fullDesc]
            writeFile(outdir & posting.employer.replace("/") & " - " & posting.jobName.replace("/") & ".txt", output)
            echo "Wrote job to file!"
            cache.writeLine(posting.URL)
            echo "Wrote listing to cache!"
        else:
            echo posting.URL & " was in cache or hit a trigger word, discarding"
    session.close()


cache.close()
terminate(chromedriver)