# src/indeedwatcher.nim

import rss
import webdriver
import osproc
import options
import os
import strutils
import parsetoml
import sequtils


# Configuration
# TODO create the config folder and a starter config.toml automatically on
# first start-up (parseFile still fails if the config file is missing).
let config = parsetoml.parseFile(getConfigDir() / "indeedwatcher" / "config.toml")
let feeds = config["feeds"].getElems().mapIt(it.getStr())
let outdir = config["outdir"].getStr()
let chromedriverloc = config["chromedriver"].getStr()
let cachefileloc = getCacheDir() / "indeedwatcher" / "listings.cache"

# Listing cache: one previously saved listing URL per line.
# The cache folder is created up front so the cache can always be written
# back later, and a missing cache file (first run) simply means an empty cache
# instead of an IOError from readFile.
createDir(parentDir(cachefileloc))
var cache: seq[string]
if fileExists(cachefileloc):
    # Blank lines carry no URL; dropping them keeps the file from accumulating
    # empty entries when it is re-joined with "\n" on exit.
    cache = readFile(cachefileloc).splitLines().filterIt(it.len > 0)

#Webdriver
let chromedriver = startProcess(chromedriverloc, "", ["--headless"])
# Presumably gives chromedriver time to start listening before we connect.
sleep 5000
echo "connecting"
#TODO make the port configurable, some users may have something running here
let driver = newWebDriver("http://localhost:9515")
let session = driver.createSession()
# Number of listing pages visited since the last rest pause.
var counter = 0

proc terminate() {.noconv.} =
    ## Ctrl-C hook: saves the listing cache, closes the webdriver session,
    ## kills the chromedriver process and exits the program.
    ##
    ## The chromedriver kill and the exit live in a `finally` so they happen
    ## even when saving the cache or closing the session fails (for example
    ## when chromedriver has already gone away and the close request errors).
    echo "\nAcknowledged termination attempt..."
    try:
        echo "Writing Cache..."
        writeFile(cachefileloc, cache.join("\n"))
        echo "Closing Session..."
        session.close()
    except CatchableError as e:
        # Best effort during shutdown: report the problem and keep going.
        echo "Cleanup step failed: " & e.msg
    finally:
        echo "Killing Chromedriver..."
        terminate(chromedriver)
        echo "Dying!"
        quit()
setControlCHook(terminate)

proc elementText(selector: string, fallback = "None Listed"): string =
    ## Returns the text of the element matching the CSS `selector` on the page
    ## the session currently shows, or `fallback` when no element matches.
    let element = session.findElement(selector)
    if element.isSome:
        element.get().getText()
    else:
        fallback

# Everything runs inside try/finally so that the cache is saved and
# chromedriver is shut down even when a feed or a page fails half-way through.
try:
    # outdir is treated as a directory; make sure it exists before writing.
    if outdir.len > 0:
        createDir(outdir)

    for feed in feeds:

        #Getting the listing URLs from the feeds
        echo "now reading " & feed
        sleep 1000
        let rssFeedReply = getRSS(feed)

        for entry in rssFeedReply.items:
            #Sleep so indeed.com doesn't freak out
            if counter > 7:
                echo "resting for 7 seconds ..."
                sleep 7000
                counter = 0

            #Don't even bother visiting it if its in the cache
            let listingUrl = entry.link
            # The fourth '&'-separated part of the link is used as the listing
            # id. Links with fewer parts (or an empty part) fall back to
            # matching on the whole URL instead of raising an IndexDefect.
            let urlParts = listingUrl.split('&')
            let listingId =
                if urlParts.len > 3 and urlParts[3].len > 0: urlParts[3]
                else: listingUrl
            if cache.anyIt(listingId in it):
                continue

            echo listingUrl
            session.navigate(listingUrl)
            inc counter

            #HTML Parser
            let titleElement = session.findElement(".jobsearch-JobInfoHeader-title")
            let descElement = session.findElement("#jobDescriptionText")
            if titleElement.isNone or descElement.isNone:
                # Without a title or description there is nothing useful to
                # save; the URL is left out of the cache so it is retried on
                # the next run.
                echo "skipping " & listingUrl & ": title or description not found"
                continue
            let jobTitle = titleElement.get().getText()
            let fullDesc = descElement.get().getText()
            let employer = elementText(".jobsearch-InlineCompanyRating-companyHeader")
            let salaryInfoAndJobType = elementText("#salaryInfoAndJobType")

            #Job Value Scorer

            #Output
            # Built line by line so the file does not inherit the source
            # code's indentation.
            let output = [
                "Title: " & jobTitle,
                "Company: " & employer,
                "Salary Info and Job Type: " & salaryInfoAndJobType,
                "URL : " & listingUrl,
                "Description:",
                fullDesc
            ].join("\n") & "\n"
            # NOTE(review): listings that share a title overwrite each other's
            # file; consider adding the listing id to the file name.
            writeFile(outdir / (jobTitle.replace("/") & ".txt"), output)
            cache.add(listingUrl)
finally:
    try:
        # Save the cache first: it is the only state worth keeping.
        writeFile(cachefileloc, cache.join("\n"))
        session.close()
    finally:
        # Always stop chromedriver, even if the steps above raised.
        terminate(chromedriver)