-
Notifications
You must be signed in to change notification settings - Fork 5
/
parse_email_files.js
53 lines (43 loc) · 1.27 KB
/
parse_email_files.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
//
// Usage: parse_email_files.js -h
//
var opts = require('commander'),
walk = require('walk'),
fs = require('fs'),
path = require('path'),
md5 = require('md5'),
MailParser = require("mailparser").MailParser;
// Declare the command-line options and parse the arguments given to the script.
opts
  .option('-s --source-dir [path]', 'Path to Enron emails base directory')
  .option('-d --destination-dir [path]', 'Path to store parsed files', './dataset')
  .parse(process.argv);

// Newer commander releases no longer store option values as properties on the
// command object; read them through opts() when that method exists and fall
// back to the command object itself for older releases.
var parsedOpts = typeof opts.opts === 'function' ? opts.opts() : opts;
var srcDir = parsedOpts.sourceDir,
    destDir = parsedOpts.destinationDir;

// Both values must be non-empty strings. An optional-value option passed
// without a value (e.g. a bare `-s`) may be reported as boolean `true`, which
// is truthy but not a usable path, so a plain falsy check is not enough.
if (typeof srcDir !== 'string' || !srcDir ||
    typeof destDir !== 'string' || !destDir) {
  opts.outputHelp();
  process.exit(1);
}
// Walk the source directory and convert every file found into a JSON document
// stored in the destination directory. `seen` records the signatures of the
// e-mails already handled so that duplicates are written only once.
var walker = walk.walk(srcDir),
    seen = {};

// Build the de-duplication key for a parsed e-mail.
// The key is the MD5 of sender address + subject + timestamp. When the message
// has no sender or no date, those fields cannot be read, so the key falls back
// to the MD5 of the file path: such a message is never mistaken for a
// duplicate of another file and is still written out.
function emailSignature(mail, filePath) {
  var sender = mail.from && mail.from[0];
  if (!sender || !mail.date || typeof mail.date.getTime !== 'function') {
    return md5(filePath);
  }
  return md5(sender.address + mail.subject + mail.date.getTime());
}

walker.on('file', function(root, fileStats, next) {
  var filePath = path.join(root, fileStats.name);

  // Read file contents off disk
  fs.readFile(filePath, function(err, data) {
    if (err) {
      // Report the unreadable file and keep walking instead of handing
      // undefined data to the parser.
      console.error('Unable to read ' + filePath + ': ' + err.message);
      next();
      return;
    }

    var mailParser = new MailParser();

    // Write the parsed email to a JSON file (unless it is a duplicate)
    mailParser.on('end', function(mail) {
      // De-dupe
      var signature = emailSignature(mail, filePath);
      if (seen[signature]) {
        return;
      }
      seen[signature] = true;

      var outPath = path.join(destDir, signature + '.json');
      // fs.writeFile needs a callback; use it to surface write failures
      // (for example a missing destination directory).
      fs.writeFile(outPath, JSON.stringify(mail), function(writeErr) {
        if (writeErr) {
          console.error('Unable to write ' + outPath + ': ' + writeErr.message);
        }
      });
    });

    // Parse file contents as EML format
    mailParser.write(data);
    mailParser.end();

    // Move on to the next file without waiting for parsing to finish.
    next();
  });
});