Support raw import mode when reading from string
lejon committed Dec 20, 2019
1 parent 30b51b1 commit 951e2bf
Showing 2 changed files with 21 additions and 8 deletions.
pom.xml — 3 changes: 2 additions & 1 deletion

@@ -97,7 +97,8 @@
 <!-- v8.1.0: Added Polya Urn with seeding (fixed small bug in Spalias with prior)-->
 <!-- v8.1.1: Added 'raw' mode, i.e no pre-processing of input data-->
 <!-- v8.2.1: Added 'continue sampling' and 'init from' functionality -->
-<version>8.2.1</version>
+<!-- v8.2.2: Support 'raw' mode when reading corpus from string -->
+<version>8.2.2</version>

 <name>Partially Collapsed Parallel LDA</name>

src/main/java/cc/mallet/util/LDAUtils.java — 26 changes: 19 additions & 7 deletions

@@ -98,16 +98,28 @@ public static Pipe buildSerialPipe(String stoplistFile) {
 	}

 	public static Pipe buildSerialPipe(String stoplistFile, Alphabet dataAlphabet) {
-		return buildSerialPipe(stoplistFile, dataAlphabet, null);
+		return buildSerialPipe(stoplistFile, dataAlphabet, null, false);
 	}

-	public static Pipe buildSerialPipe(String stoplistFile, Alphabet dataAlphabet, LabelAlphabet targetAlphabet) {
+	public static Pipe buildSerialPipe(String stoplistFile, Alphabet dataAlphabet, boolean raw) {
+		return buildSerialPipe(stoplistFile, dataAlphabet, null, raw);
+	}
+
+	public static Pipe buildSerialPipe(String stoplistFile, Alphabet dataAlphabet, LabelAlphabet targetAlphabet, boolean raw) {
 		int maxBufSize = 10000;
-		SimpleTokenizerLarge tokenizer = null;
-		if(stoplistFile==null) {
-			tokenizer = new SimpleTokenizerLarge(new HashSet<String>(), maxBufSize);
+		Pipe tokenizer = null;
+		if(raw) {
+			if(stoplistFile==null) {
+				tokenizer = new RawTokenizer(new HashSet<String>(), maxBufSize);
+			} else {
+				tokenizer = new RawTokenizer(new File(stoplistFile), maxBufSize);
+			}
 		} else {
-			tokenizer = new SimpleTokenizerLarge(new File(stoplistFile), maxBufSize);
+			if(stoplistFile==null) {
+				tokenizer = new SimpleTokenizerLarge(new HashSet<String>(), maxBufSize);
+			} else {
+				tokenizer = new SimpleTokenizerLarge(new File(stoplistFile), maxBufSize);
+			}
 		}

 		ArrayList<Pipe> pipes = new ArrayList<Pipe>();
@@ -130,7 +142,7 @@ public static Pipe buildSerialPipe(String stoplistFile, Alphabet dataAlphabet, LabelAlphabet targetAlphabet, boolean raw) {

 		Target2Label ttl = new Target2Label (tAlphabet);

-		pipes.add(csl);
+		if(!raw) pipes.add(csl);
 		pipes.add(tokenizer);
 		pipes.add(sl2fs);
 		pipes.add(ttl);
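Note (not part of the commit): a minimal sketch of how a caller might exercise the new boolean raw overload added above. The stoplist path, document text, labels, and class name are made up for illustration; it assumes a null dataAlphabet is accepted (as in the existing one-argument overload) and uses MALLET's standard Instance/InstanceList API.

import cc.mallet.pipe.Pipe;
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;
import cc.mallet.util.LDAUtils;

public class RawPipeExample {
	public static void main(String[] args) {
		// Same stoplist, two pipes: the usual pre-processing pipeline,
		// and the 'raw' one that swaps in RawTokenizer and skips the
		// cleaning pipe ('csl' is only added when !raw in the diff above).
		Pipe standardPipe = LDAUtils.buildSerialPipe("stoplist.txt", null, false);
		Pipe rawPipe = LDAUtils.buildSerialPipe("stoplist.txt", null, true);

		String doc = "Sampling LDA topics from RAW text!";

		// Feed the same in-memory string through both pipelines.
		InstanceList standard = new InstanceList(standardPipe);
		standard.addThruPipe(new Instance(doc, "X", "doc-1", null));

		InstanceList raw = new InstanceList(rawPipe);
		raw.addThruPipe(new Instance(doc, "X", "doc-1", null));

		// The raw pipe should leave tokens un-normalized, so the two
		// data alphabets will generally differ.
		System.out.println("standard: " + standard.getDataAlphabet());
		System.out.println("raw:      " + raw.getDataAlphabet());
	}
}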
