-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathisr-prog2.cpp
280 lines (229 loc) · 9.96 KB
/
isr-prog2.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
//Author: Nick Drafahl
//Class: Information Storage and Retrieval
//Usage: ./isr-prog2 <file1 [file2] ...>
//Compile: g++ -Wall -pedantic -ansi isr-prog2.cpp stemmer.o -o isr-prog2
#include <set>
#include <vector>
#include <iostream>
#include <string>
#include <fstream>
#include <cstdlib>
#include <iterator>
#include <algorithm>
// One entry of the inverted index: a unique token together with the numbers
// of the documents it was found in.
struct token {
    std::string sWord;          // Token text; operator< orders tokens by this field
    std::vector <int> docList;  // Every document number recorded, in call order (dupes allowed)
    std::set <int> uniDocList;  // Distinct document numbers (dupes not allowed)
    int uniSize;                // Cached uniDocList.size(); kept current by addDocNum/updateUniSize

    // Builds a token for word x with empty document lists.
    // Deliberately left implicit so that a std::string can be used wherever a
    // token is expected (e.g. as the key passed to std::set<token>::find).
    token(const std::string& x) : sWord(x), uniSize(0) {}

    // Records that the token occurs in document docNum. The occurrence is
    // always appended to docList; uniDocList only grows for a new document.
    void addDocNum(int docNum) {
        docList.push_back(docNum);
        uniDocList.insert(docNum);
        // Refresh the cache here so uniSize is never stale or uninitialised,
        // even when the caller forgets to call updateUniSize().
        updateUniSize();
    }

    friend bool operator< (const token &left, const token &right);

    // Returns a copy of the token text.
    std::string getWord() const { return sWord; }

    // Returns a copy of the set of distinct document numbers.
    std::set<int> getUniqueDocList() const { return uniDocList; }

    // Returns the number of distinct documents containing the token.
    int getUniSize() const { return static_cast<int>(uniDocList.size()); }

    // Re-synchronises the cached uniSize with uniDocList.
    void updateUniSize() { uniSize = static_cast<int>(uniDocList.size()); }
};
// Function Prototypes
// Prints the numbered file legend followed by the "Word / Posting" table
// header; biggestWord sets the column width.
void printFileCollection (std::vector <std::string> fCol, int biggestWord);
// Strips punctuation from one word and returns the remaining piece(s).
std::vector<std::string> removePunc (std::string word);
// NOTE(review): the two comparators below are declared, but no definition or
// call is visible in this file -- confirm whether they are still needed.
bool compTokensLess(const token & t1, const token & t2);
bool compTokensLower(const token & t1, const std::string & t2);
// Prints the word of every token in the vector, one per line.
void printTokenVector(std::vector<token> tVector);
// Prints the integers of a set on one line, each followed by a space.
void printIntegerSet(std::set<int> tSet);
// Stemmer entry point defined outside this file (the compile line links
// stemmer.o). NOTE(review): parameter meanings are not shown here -- confirm
// against the stemmer source before calling.
extern int stem(char* p, int i, int j);
int main(int argc, char * argv[]) {
// Variable Declarations
std::vector<std::string> fileCollection;
std::vector<token> tokenVec;
std::set<token> tokenSet;
unsigned int biggestWord = 0;
std::ifstream myFile;
/*std::string test = "Hello";
int len = test.size();
char *charArray = &test[0];
int idx = stem(charArray, 0, len);
for(int i = 0; i < test.size(); ++i) {
std::cout << charArray[i];
} */
// Print to CLI if the user didn't pass in any documents to parse
if (argc <= 1) {
std::cout << "Usage: This program requires at least one document to be parsed.\n";
exit(0);
}
// Primary loop that loops through all the documents passed in via command
// line
for (int i = 1; i < argc; ++i) {
fileCollection.push_back(argv[i]);
myFile.open(argv[i]);
if (!myFile.is_open()) {
std::cout << "Unable to open file: " << argv[i] << std::endl;
std::cout << "Usage: " << argv[i] << " document1 [document2] ...\n";
exit(0);
}
std::string word;
std::string newWord;
std::vector<std::string> puncWords;
// Loop that pulls all words from a single file
while (myFile >> word) {
// Clear puncWords on new word
puncWords.clear();
// Call removePunc function on the word, return a vector of words
// that we will add into the postings list.
std::cout << "Calling removePunc from main.\n";
puncWords = removePunc(word);
std::cout << "Returned to main from removePunc\n";
// For each words that we've returned from removePunc, verify that
// the word isn't bigger than the biggest word. We are going to
// then see if our set of tokens already contains that word
for (unsigned int j = 0; j < puncWords.size(); ++j) {
newWord = puncWords.at(j);
std::set<token>::iterator c;
if (newWord.size() > biggestWord) {
biggestWord = word.size();
}
// Search through our set of tokens. If we find that the token
// already exists in the set, let's add this document number to
// that token's list of document numbers.
if(tokenSet.find(newWord) != tokenSet.end()) {
token temp = *c;
temp.addDocNum(i);
temp.updateUniSize();
tokenSet.erase(c);
tokenSet.insert(temp);
// Otherwise, create a new token because we've never seen this
// token before
} else {
token temp = token(newWord);
temp.addDocNum(i);
tokenSet.insert(temp);
} //end if..else
} //end (unsigned int... (process words, add to token set)
} //end while(myFile... (pull words from file)
myFile.close();
} //end for(int i... (read through documents)
// Print Legend
printFileCollection(fileCollection, biggestWord);
// For each token in the set of tokens, loop through and print the word
// and the postings list for that word
std::set<token>::iterator iter;
for(iter = tokenSet.begin(); iter != tokenSet.end(); ++iter) {
token t = *iter;
std::cout << t.sWord;
for (unsigned int i = t.sWord.size() -1; i < biggestWord; ++i) {
std::cout << " ";
}
printIntegerSet(t.uniDocList); // Print document numbers for that token
}
return 0;
}
// Writes the legend to standard output: one numbered line per file name in
// fCol (numbering starts at 1), a blank line, and then the two-column table
// header. biggestWord is the width used for the "Word" column and for each of
// the two dashed rules underneath it.
void printFileCollection (std::vector< std::string > fCol, int biggestWord) {
    std::cout << "Legend:" << std::endl;

    std::vector<std::string>::size_type number = 0;
    for (std::vector<std::string>::const_iterator name = fCol.begin(); name != fCol.end(); ++name) {
        ++number;
        std::cout << number << ". " << *name << std::endl;
    }

    // "Word" already occupies four columns; pad the rest of the column width.
    const std::string headerPad(
        biggestWord > 4 ? static_cast<std::string::size_type>(biggestWord - 4) : 0, ' ');
    // A rule as wide as the word column (empty when the width is not positive).
    const std::string rule(
        biggestWord > 0 ? static_cast<std::string::size_type>(biggestWord) : 0, '-');

    std::cout << std::endl;
    std::cout << "Word" << headerPad << " Posting\n";
    std::cout << rule << " " << rule << std::endl;
}
// Splits one chunk of text into its punctuation-free words.
//
// The characters  ! ? - , " ) ( : . & % [ ] ;  are treated as separators: they
// are dropped, and the characters collected so far are emitted as a word, so
// "hello,world" yields {"hello", "world"} and "test!!test2" yields
// {"test", "test2"}. A separator that is immediately followed by a space does
// not end the current word. All other characters (apostrophes included) are
// kept unchanged.
//
// Returns the words in order of appearance; the result is empty when the
// input is empty or consists only of separators. Nothing is printed.
std::vector<std::string> removePunc (std::string word) {
    static const std::string separators("!?-,\")(:.&%[];");

    std::vector<std::string> pieces;
    std::string current;

    for (std::string::size_type i = 0; i < word.size(); ++i) {
        const char ch = word[i];

        // Ordinary character: it belongs to the word being collected.
        if (separators.find(ch) == std::string::npos) {
            current.push_back(ch);
            continue;
        }

        // Look ahead only when a next character exists; indexing one past the
        // last character of a non-const string is not guaranteed to be valid
        // under C++98, which this file is compiled as.
        const bool spaceFollows = (i + 1 < word.size()) && (word[i + 1] == ' ');

        // Emit the collected word. Nothing is collected after consecutive
        // separators ("!!"), in which case there is nothing to emit.
        if (!spaceFollows && !current.empty()) {
            pieces.push_back(current);
            current.clear();
        }
    }

    // Whatever remains after the last separator is the final word.
    if (!current.empty()) {
        pieces.push_back(current);
    }
    return pieces;
} // end removePunc
// Function to print the words stored currently in a vector of tokens
void printTokenVector(std::vector<token> tVector) {
for(unsigned int i = 0; i < tVector.size(); ++i) {
std::cout << tVector.at(i).sWord << std::endl;
}
std::cout << "\n";
}
// Writes every integer of tSet (here: the document numbers a token was found
// in) to standard output in ascending order, each followed by a single space,
// and then ends the line.
void printIntegerSet(std::set<int> tSet) {
    std::copy(tSet.begin(), tSet.end(), std::ostream_iterator<int>(std::cout, " "));
    std::cout << std::endl;
}
// Strict weak ordering for tokens, based solely on the token text, so that
// tokens can be stored in a std::set (two tokens with the same word are
// therefore considered equivalent by the set).
bool operator< (const token &left, const token &right) {
    return left.sWord.compare(right.sWord) < 0;
}