src/indeedwatcher.nim


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147

import rss
import webdriver
import osproc
import options
import os
import strutils
import parsetoml
import sequtils

# First-run bootstrap: make sure a config file and a listings cache exist
# before the rest of the program tries to read them.
if not fileExists(getConfigDir() & "/indeedwatcher/config.toml"):
    createDir(getConfigDir() & "/indeedwatcher/")
    # Triple-quoted string literals do not interpret escape sequences, so the
    # quotes below are written bare; a backslash would end up verbatim in the
    # generated file and make it invalid TOML.
    let defaultConfig = """
    #Output directory of your porential job listings
    outdir = ""
    #Port you wish chromedriver to use
    port = 9515
    #Location of chromedriver
    chromedriver = ""
    #Array of RSS urls that you wish the program to parse
    feeds = [ "https://rss.indeed.com/rss?q=Information%20Technology&l=Remote&jt=contract&explvl=entry_level", "https://rss.indeed.com/rss?q=Information%20Technology&l=Remote&jt=temporary&explvl=entry_level"]
    
    #Phrases that, if they appear, will cause the job to be instantly thrown out
    [blacklist]
    title= ["Senior", "Sr."]
    """
    writeFile(getConfigDir() & "/indeedwatcher/config.toml", defaultConfig)

# Only create an empty cache when none exists yet. The check has to look at the
# cache file itself; otherwise the cache would be truncated on every start and
# previously seen listings would be visited again.
if not fileExists(getCacheDir() & "/indeedwatcher/listings.cache"):
    createDir(getCacheDir() & "/indeedwatcher/")
    writeFile(getCacheDir() & "/indeedwatcher/listings.cache", "")


#Reading the config file
let config = parsetoml.parseFile(getConfigDir() & "/indeedwatcher/config.toml")

let
    #Main section of config: feed URLs to poll and where listings are written
    feeds = mapIt(getElems(config["feeds"]), getStr(it))
    outdir = getStr(config["outdir"])

    #Path of the chromedriver executable
    chromedriverloc = getStr(config["chromedriver"])

    #Cache file holding one already-saved listing URL per line
    cachefileloc = getCacheDir() & "/indeedwatcher/listings.cache"

    #Filtering section of config: phrases that disqualify a job title
    titleblacklist = mapIt(getElems(config["blacklist"]["title"]), getStr(it))

#Mutable because newly saved listings are appended while the program runs
var cache = readFile(cachefileloc).splitLines()


#Webdriver
# Port chromedriver should listen on. Read from the `port` key of the config
# file; when the key is missing (or is not an integer) fall back to 9515, the
# port this program previously hard-coded.
let chromedriverport = config{"port"}.getInt(9515)

# The same port is handed to chromedriver and used in the URL below so the two
# always agree.
let chromedriver = startProcess(chromedriverloc, "",
                                ["--headless", "--port=" & $chromedriverport])
# Fixed pause before connecting, giving chromedriver time to come up.
sleep 5000
echo "connecting"
let driver = newWebDriver("http://localhost:" & $chromedriverport)
let session = driver.createSession()
# Number of pages visited since the last rest pause (see the main loop).
var counter = 0

proc terminate() {.noconv.} = 
    ## Ctrl-C hook: persists the cache, closes the browser session, kills the
    ## chromedriver process and exits the program.
    echo "\nAcknowledged termination attempt..."
    echo "Writing Cache..."
    # Written first so the cache survives even if the later steps fail.
    writeFile(cachefileloc, cache.join("\n"))
    echo "Closing Session..."
    try:
        session.close()
    except CatchableError:
        # Closing is best effort: a failure here (for example because
        # chromedriver is no longer reachable) must not stop the hook before
        # the chromedriver process is killed and the program exits.
        echo "Could not close the session cleanly, continuing..."
    echo "Killing Chromedriver..."
    # Resolves to osproc's terminate(Process), not to this hook.
    terminate(chromedriver)
    echo "Dying!"
    quit()
setControlCHook(terminate)

# Text written for every accepted listing; $1..$5 are filled with title,
# employer, salary/job type, URL and description. The leading whitespace is
# part of the emitted text and is kept exactly as it has always been written.
const listingTemplate = """
                 Title: $1
                 Company: $2
                 Salary Info and Job Type: $3
                 URL : $4
                 Description:
                 $5
                 """

template retrying(body: untyped) =
    ## Runs `body`; when it raises a catchable error it is run again, up to
    ## four attempts in total. The error of the last attempt propagates.
    for attempt in 0 .. 3:
        try:
            body
            break
        except CatchableError:
            if attempt == 3:
                raise
            echo "Recieved an error: trying again..."

proc elementTextOr(selector, fallback: string): string =
    ## Text of the element matching `selector` on the page the session is
    ## currently showing, or `fallback` when no such element is found.
    let found = session.findElement(selector)
    if found.isSome: found.get().getText()
    else: fallback

try:
    for feed in feeds:

        #Getting the listing URLs from the feeds
        var rssFeedReply: RSS
        retrying:
            echo "now reading " & feed
            rssFeedReply = getRSS(feed)

        for entry in rssFeedReply.items:
            #Sleep so indeed.com doesn't freak out
            if counter > 7:
                echo "resting for 7 seconds ..."
                sleep 7000
                counter = 0

            let url = entry.link
            # The fourth '&'-separated part of the link is used as the key for
            # cache lookups (presumably the listing's id - confirm against the
            # feed). Links with fewer parts fall back to the whole URL instead
            # of failing on an out-of-range index.
            let urlParts = url.split('&')
            let urlId = if urlParts.len > 3: urlParts[3] else: url

            #Don't even bother visiting it if its in the cache
            if cache.anyIt(it.contains(urlId)):
                echo url & " was in cache, discarding"
                continue

            retrying:
                echo "Telling chromium to navigate to " & url
                session.navigate(url)
            inc counter

            #HTML Parser
            let titleElement = session.findElement(".jobsearch-JobInfoHeader-title")
            let descElement = session.findElement("#jobDescriptionText")
            if titleElement.isNone or descElement.isNone:
                # Without a title and description there is nothing useful to
                # save; skip this listing rather than abort the whole run. It
                # is not cached, so it is retried on the next run.
                echo "Could not find the job title or description at " & url & ", skipping"
                continue
            let jobTitle = titleElement.get().getText()
            let fullDesc = descElement.get().getText()

            let employer = elementTextOr(
                ".jobsearch-InlineCompanyRating-companyHeader", "None Listed")
            let salaryInfoAndJobType = elementTextOr(
                "#salaryInfoAndJobType", "None Listed")

            #Filtering
            if titleblacklist.anyIt(jobTitle.contains(it)):
                echo "Trigger was hit, discarding " & url
                continue

            #Output
            let output = listingTemplate % [jobTitle, employer, salaryInfoAndJobType, url, fullDesc]
            # '/' is removed from the title so it cannot introduce extra path
            # components; joining with `/` keeps the file inside outdir whether
            # or not outdir ends with a separator.
            writeFile(outdir / (jobTitle.replace("/") & ".txt"), output)
            cache.add(url)
finally:
    # Runs on normal completion and when an error escapes the loop, so the
    # cache is persisted and chromedriver is not left running. The cache is
    # written first because it does not depend on the browser being healthy.
    writeFile(cachefileloc, cache.join("\n"))
    try:
        session.close()
    finally:
        terminate(chromedriver)