package org.apache.lucene.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.index.Payload;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.analysis.tokenattributes.*;
/**
 * This class tests some special cases of backwards compatibility when using the
 * new TokenStream API with old analyzers.
 *
 * <p>Old streams produce tokens via the deprecated next()/next(Token) methods,
 * while new consumers pull attributes via incrementToken(); the tests below mix
 * both styles across the same filter chains.
 */
public class TestTokenStreamBWComp extends LuceneTestCase {
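  // Fixture: a sample sentence, the stop words removed by the chains below, and
  // the terms expected to survive (lowercased, with stop words removed).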
private static final String doc = "This is the new TokenStream api";
private static final String[] stopwords = new String[] {"is", "the", "this"};
private static final String[] results = new String[] {"new", "tokenstream", "api"};
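  /** A Token subclass that carries a simple part-of-speech tag through the chain. */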
public static class POSToken extends Token {
public static final int PROPERNOUN = 1;
public static final int NO_NOUN = 2;
private int partOfSpeech;
public void setPartOfSpeech(int pos) {
partOfSpeech = pos;
}
public int getPartOfSpeech() {
return this.partOfSpeech;
}
}
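  /**
   * Tags each token with a part of speech, intentionally using only the
   * deprecated next() API, so the compatibility layer must carry the custom
   * POSToken subclass through the chain.
   */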
static class PartOfSpeechTaggingFilter extends TokenFilter {
protected PartOfSpeechTaggingFilter(TokenStream input) {
super(input);
}
public Token next() throws IOException {
Token t = input.next();
if (t == null) return null;
POSToken pt = new POSToken();
pt.reinit(t);
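      // classify by the case of the first character: upper case means proper noun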
if (pt.termLength() > 0) {
if (Character.isUpperCase(pt.termBuffer()[0])) {
pt.setPartOfSpeech(POSToken.PROPERNOUN);
} else {
pt.setPartOfSpeech(POSToken.NO_NOUN);
}
}
return pt;
}
}
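  /**
   * Attaches a payload annotation to tokens that the tagging filter marked as
   * proper nouns; this filter also uses only the deprecated next() API.
   */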
static class PartOfSpeechAnnotatingFilter extends TokenFilter {
public final static byte PROPER_NOUN_ANNOTATION = 1;
protected PartOfSpeechAnnotatingFilter(TokenStream input) {
super(input);
}
public Token next() throws IOException {
Token t = input.next();
if (t == null) return null;
if (t instanceof POSToken) {
POSToken pt = (POSToken) t;
if (pt.getPartOfSpeech() == POSToken.PROPERNOUN) {
pt.setPayload(new Payload(new byte[] {PROPER_NOUN_ANNOTATION}));
}
return pt;
} else {
return t;
}
}
}
  // Test the chain: the one and only term "TokenStream" should be tagged as a proper noun.
public void testTeeSinkCustomTokenNewAPI() throws IOException {
testTeeSinkCustomToken(0);
}
public void testTeeSinkCustomTokenOldAPI() throws IOException {
testTeeSinkCustomToken(1);
}
public void testTeeSinkCustomTokenVeryOldAPI() throws IOException {
testTeeSinkCustomToken(2);
}
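  // api: 0 = new attribute API, 1 = old next(Token) API, 2 = very old next() API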
private void testTeeSinkCustomToken(int api) throws IOException {
TokenStream stream = new WhitespaceTokenizer(new StringReader(doc));
stream = new PartOfSpeechTaggingFilter(stream);
stream = new LowerCaseFilter(stream);
stream = new StopFilter(stream, stopwords);
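    // Tee the tagged stream into a sink; each branch gets its own annotating
    // filter, so the custom POSToken must survive the tee/sink round trip.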
SinkTokenizer sink = new SinkTokenizer();
TokenStream stream1 = new PartOfSpeechAnnotatingFilter(sink);
stream = new TeeTokenFilter(stream, sink);
stream = new PartOfSpeechAnnotatingFilter(stream);
switch (api) {
case 0:
consumeStreamNewAPI(stream);
consumeStreamNewAPI(stream1);
break;
case 1:
consumeStreamOldAPI(stream);
consumeStreamOldAPI(stream1);
break;
case 2:
consumeStreamVeryOldAPI(stream);
consumeStreamVeryOldAPI(stream1);
break;
}
}
  // Test that caching the special custom POSToken works in all cases.
  public void testCachingCustomTokenNewAPI() throws IOException {
    testCachingCustomToken(0);
  }

  public void testCachingCustomTokenOldAPI() throws IOException {
    testCachingCustomToken(1);
  }

  public void testCachingCustomTokenVeryOldAPI() throws IOException {
    testCachingCustomToken(2);
  }

  public void testCachingCustomTokenMixed() throws IOException {
    testCachingCustomToken(3);
  }
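  // api: 0 = new, 1 = old, 2 = very old, 3 = a mix of all three consumer styles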
private void testCachingCustomToken(int api) throws IOException {
TokenStream stream = new WhitespaceTokenizer(new StringReader(doc));
stream = new PartOfSpeechTaggingFilter(stream);
stream = new LowerCaseFilter(stream);
stream = new StopFilter(stream, stopwords);
stream = new CachingTokenFilter(stream); // <- the caching is done before the annotating!
stream = new PartOfSpeechAnnotatingFilter(stream);
switch (api) {
case 0:
consumeStreamNewAPI(stream);
consumeStreamNewAPI(stream);
break;
case 1:
consumeStreamOldAPI(stream);
consumeStreamOldAPI(stream);
break;
case 2:
consumeStreamVeryOldAPI(stream);
consumeStreamVeryOldAPI(stream);
break;
case 3:
consumeStreamNewAPI(stream);
consumeStreamOldAPI(stream);
consumeStreamVeryOldAPI(stream);
consumeStreamNewAPI(stream);
consumeStreamVeryOldAPI(stream);
break;
}
}
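  // Consumes the stream with the new attribute-based API (incrementToken()).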
private static void consumeStreamNewAPI(TokenStream stream) throws IOException {
stream.reset();
PayloadAttribute payloadAtt = (PayloadAttribute) stream.addAttribute(PayloadAttribute.class);
TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
int i=0;
while (stream.incrementToken()) {
String term = termAtt.term();
Payload p = payloadAtt.getPayload();
if (p != null && p.getData().length == 1 && p.getData()[0] == PartOfSpeechAnnotatingFilter.PROPER_NOUN_ANNOTATION) {
assertEquals("only TokenStream is a proper noun", "tokenstream", term);
} else {
assertFalse("all other tokens (if this test fails, the special POSToken subclass is not correctly passed through the chain)", "tokenstream".equals(term));
}
assertEquals(results[i], term);
i++;
}
}
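  // Consumes the stream with the old reusable-token API (next(Token)).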
private static void consumeStreamOldAPI(TokenStream stream) throws IOException {
stream.reset();
Token reusableToken = new Token();
int i=0;
while ((reusableToken = stream.next(reusableToken)) != null) {
String term = reusableToken.term();
Payload p = reusableToken.getPayload();
if (p != null && p.getData().length == 1 && p.getData()[0] == PartOfSpeechAnnotatingFilter.PROPER_NOUN_ANNOTATION) {
assertEquals("only TokenStream is a proper noun", "tokenstream", term);
} else {
assertFalse("all other tokens (if this test fails, the special POSToken subclass is not correctly passed through the chain)", "tokenstream".equals(term));
}
assertEquals(results[i], term);
i++;
}
}
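  // Consumes the stream with the very old API (next() returning a new Token).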
private static void consumeStreamVeryOldAPI(TokenStream stream) throws IOException {
stream.reset();
Token token;
int i=0;
while ((token = stream.next()) != null) {
String term = token.term();
Payload p = token.getPayload();
if (p != null && p.getData().length == 1 && p.getData()[0] == PartOfSpeechAnnotatingFilter.PROPER_NOUN_ANNOTATION) {
assertEquals("only TokenStream is a proper noun", "tokenstream", term);
} else {
assertFalse("all other tokens (if this test fails, the special POSToken subclass is not correctly passed through the chain)", "tokenstream".equals(term));
}
assertEquals(results[i], term);
i++;
}
}
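  /** A do-nothing attribute, used to check that custom attributes get their own impl. */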
public interface SenselessAttribute extends Attribute {}
public static final class SenselessAttributeImpl extends AttributeImpl implements SenselessAttribute {
public void copyTo(AttributeImpl target) {}
public void clear() {}
public boolean equals(Object o) { return (o instanceof SenselessAttributeImpl); }
public int hashCode() { return 0; }
}
  // Test that tokenization fails if only the new API is allowed and an old TokenStream is in the chain.
public void testOnlyNewAPI() throws IOException {
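    // Globally forbid the old next()/next(Token) API for the duration of this test.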
TokenStream.setOnlyUseNewAPI(true);
try {
// this should fail with UOE
try {
TokenStream stream = new WhitespaceTokenizer(new StringReader(doc));
stream = new PartOfSpeechTaggingFilter(stream); // <-- this one is evil!
stream = new LowerCaseFilter(stream);
stream = new StopFilter(stream, stopwords);
while (stream.incrementToken());
fail("If only the new API is allowed, this should fail with an UOE");
} catch (UnsupportedOperationException uoe) {
assertEquals((PartOfSpeechTaggingFilter.class.getName()+" does not implement incrementToken() which is needed for onlyUseNewAPI."),uoe.getMessage());
}
// this should pass, as all core token streams support the new API
TokenStream stream = new WhitespaceTokenizer(new StringReader(doc));
stream = new LowerCaseFilter(stream);
stream = new StopFilter(stream, stopwords);
while (stream.incrementToken());
      // Test that all attributes are implemented by their own impl classes, not by Token/TokenWrapper:
assertTrue("TermAttribute is not implemented by TermAttributeImpl",
stream.addAttribute(TermAttribute.class) instanceof TermAttributeImpl);
assertTrue("OffsetAttribute is not implemented by OffsetAttributeImpl",
stream.addAttribute(OffsetAttribute.class) instanceof OffsetAttributeImpl);
assertTrue("FlagsAttribute is not implemented by FlagsAttributeImpl",
stream.addAttribute(FlagsAttribute.class) instanceof FlagsAttributeImpl);
assertTrue("PayloadAttribute is not implemented by PayloadAttributeImpl",
stream.addAttribute(PayloadAttribute.class) instanceof PayloadAttributeImpl);
assertTrue("PositionIncrementAttribute is not implemented by PositionIncrementAttributeImpl",
stream.addAttribute(PositionIncrementAttribute.class) instanceof PositionIncrementAttributeImpl);
assertTrue("TypeAttribute is not implemented by TypeAttributeImpl",
stream.addAttribute(TypeAttribute.class) instanceof TypeAttributeImpl);
assertTrue("SenselessAttribute is not implemented by SenselessAttributeImpl",
stream.addAttribute(SenselessAttribute.class) instanceof SenselessAttributeImpl);
// try to call old API, this should fail
try {
stream.reset();
Token reusableToken = new Token();
while ((reusableToken = stream.next(reusableToken)) != null);
fail("If only the new API is allowed, this should fail with an UOE");
} catch (UnsupportedOperationException uoe) {
assertEquals("This TokenStream only supports the new Attributes API.", uoe.getMessage());
}
try {
stream.reset();
while (stream.next() != null);
fail("If only the new API is allowed, this should fail with an UOE");
} catch (UnsupportedOperationException uoe) {
assertEquals("This TokenStream only supports the new Attributes API.", uoe.getMessage());
}
      // Test that the wrapper API (onlyUseNewAPI==false) uses TokenWrapper as the
      // attribute instance. TokenWrapper encapsulates a Token instance that can be
      // exchanged for another Token instance without changing the AttributeImpl
      // instance itself.
TokenStream.setOnlyUseNewAPI(false);
stream = new WhitespaceTokenizer(new StringReader(doc));
assertTrue("TermAttribute is not implemented by TokenWrapper",
stream.addAttribute(TermAttribute.class) instanceof TokenWrapper);
assertTrue("OffsetAttribute is not implemented by TokenWrapper",
stream.addAttribute(OffsetAttribute.class) instanceof TokenWrapper);
assertTrue("FlagsAttribute is not implemented by TokenWrapper",
stream.addAttribute(FlagsAttribute.class) instanceof TokenWrapper);
assertTrue("PayloadAttribute is not implemented by TokenWrapper",
stream.addAttribute(PayloadAttribute.class) instanceof TokenWrapper);
assertTrue("PositionIncrementAttribute is not implemented by TokenWrapper",
stream.addAttribute(PositionIncrementAttribute.class) instanceof TokenWrapper);
assertTrue("TypeAttribute is not implemented by TokenWrapper",
stream.addAttribute(TypeAttribute.class) instanceof TokenWrapper);
// This one is not implemented by TokenWrapper:
assertTrue("SenselessAttribute is not implemented by SenselessAttributeImpl",
stream.addAttribute(SenselessAttribute.class) instanceof SenselessAttributeImpl);
} finally {
TokenStream.setOnlyUseNewAPI(false);
}
}
public void testOverridesAny() throws Exception {
try {
TokenStream stream = new WhitespaceTokenizer(new StringReader(doc));
stream = new TokenFilter(stream) {
      // we override nothing; this anonymous subclass only makes TokenFilter concrete
};
stream = new LowerCaseFilter(stream);
stream = new StopFilter(stream, stopwords);
while (stream.incrementToken());
fail("One TokenFilter does not override any of the required methods, so it should fail.");
} catch (UnsupportedOperationException uoe) {
assertTrue("invalid UOE message", uoe.getMessage().endsWith("does not implement any of incrementToken(), next(Token), next()."));
}
}
public void testMixedOldApiConsumer() throws Exception {
    // WhitespaceTokenizer uses the new incrementToken() API:
TokenStream stream = new WhitespaceTokenizer(new StringReader("foo bar moo maeh"));
Token foo = new Token();
foo = stream.next(foo);
Token bar = stream.next();
assertEquals("foo", foo.term());
assertEquals("bar", bar.term());
Token moo = stream.next(foo);
assertEquals("moo", moo.term());
assertEquals("private 'bar' term should still be valid", "bar", bar.term());
// and now we also use incrementToken()... (very bad, but should work)
TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
assertTrue(stream.incrementToken());
assertEquals("maeh", termAtt.term());
assertEquals("private 'bar' term should still be valid", "bar", bar.term());
}
  /** An old-API stream that endlessly cycles through "foo", "bar", "meh". */
  private class RoundRobinOldAPI extends TokenStream {
    int count = 0;
    String[] terms = { "foo", "bar", "meh" };
public Token next(Token reusableToken) throws IOException {
reusableToken.setTermBuffer(terms[count % terms.length]);
count++;
return reusableToken;
}
}
public void testMixedOldApiConsumer2() throws Exception {
    // RoundRobinOldAPI uses only the old next(Token) API:
TokenStream stream = new RoundRobinOldAPI();
TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
assertTrue(stream.incrementToken());
Token bar = stream.next();
assertEquals("foo", termAtt.term());
assertEquals("bar", bar.term());
assertTrue(stream.incrementToken());
assertEquals("meh", termAtt.term());
assertEquals("private 'bar' term should still be valid", "bar", bar.term());
Token foo = stream.next();
assertEquals("the term attribute should still be the same", "meh", termAtt.term());
assertEquals("foo", foo.term());
assertEquals("private 'bar' term should still be valid", "bar", bar.term());
}
}