diff --git a/pom.xml b/pom.xml index eb2c9e8..dd3a4b5 100644 --- a/pom.xml +++ b/pom.xml @@ -97,7 +97,8 @@ - 8.2.1 + + 8.2.2 Partially Collapsed Parallel LDA diff --git a/src/main/java/cc/mallet/util/LDAUtils.java b/src/main/java/cc/mallet/util/LDAUtils.java index da29a95..80b9a3c 100644 --- a/src/main/java/cc/mallet/util/LDAUtils.java +++ b/src/main/java/cc/mallet/util/LDAUtils.java @@ -98,16 +98,28 @@ public static Pipe buildSerialPipe(String stoplistFile) { } public static Pipe buildSerialPipe(String stoplistFile, Alphabet dataAlphabet) { - return buildSerialPipe(stoplistFile, dataAlphabet, null); + return buildSerialPipe(stoplistFile, dataAlphabet, null, false); } - public static Pipe buildSerialPipe(String stoplistFile, Alphabet dataAlphabet, LabelAlphabet targetAlphabet) { + public static Pipe buildSerialPipe(String stoplistFile, Alphabet dataAlphabet, boolean raw) { + return buildSerialPipe(stoplistFile, dataAlphabet, null, raw); + } + + public static Pipe buildSerialPipe(String stoplistFile, Alphabet dataAlphabet, LabelAlphabet targetAlphabet, boolean raw) { int maxBufSize = 10000; - SimpleTokenizerLarge tokenizer = null; - if(stoplistFile==null) { - tokenizer = new SimpleTokenizerLarge(new HashSet(), maxBufSize); + Pipe tokenizer = null; + if(raw) { + if(stoplistFile==null) { + tokenizer = new RawTokenizer(new HashSet(), maxBufSize); + } else { + tokenizer = new RawTokenizer(new File(stoplistFile), maxBufSize); + } } else { - tokenizer = new SimpleTokenizerLarge(new File(stoplistFile), maxBufSize); + if(stoplistFile==null) { + tokenizer = new SimpleTokenizerLarge(new HashSet(), maxBufSize); + } else { + tokenizer = new SimpleTokenizerLarge(new File(stoplistFile), maxBufSize); + } } ArrayList pipes = new ArrayList(); @@ -130,7 +142,7 @@ public static Pipe buildSerialPipe(String stoplistFile, Alphabet dataAlphabet, L Target2Label ttl = new Target2Label (tAlphabet); - pipes.add(csl); + if(!raw) pipes.add(csl); pipes.add(tokenizer); pipes.add(sl2fs); pipes.add(ttl);