package de.danielbasedow.prospecter.core.analysis;
import de.danielbasedow.prospecter.core.Token;
import java.util.List;
/**
 * Contract for analyzers: components that turn raw text into a list of {@link Token}s.
 */
public interface Analyzer {

    /**
     * Tokenizes the raw input.
     *
     * @param input raw string that should be turned into tokens
     * @return the list of tokens produced from {@code input}
     * @throws TokenizerException if tokenization fails; the exact conditions are
     *                            implementation-specific
     */
    List<Token> tokenize(String input) throws TokenizerException;

    /**
     * Tokenizes the raw input, optionally suppressing the creation of tokens that were
     * not known before.
     *
     * <p>Suppressing new tokens makes sense when tokenizing documents: a token in a
     * document can only match if it has already been seen in a query, so there is no
     * point in generating tokens for anything else.
     *
     * @param input              raw string that should be turned into tokens
     * @param dontGenerateNewIds if {@code true}, no new tokens will be generated
     * @return the list of tokens produced from {@code input}
     * @throws TokenizerException if tokenization fails; the exact conditions are
     *                            implementation-specific
     */
    List<Token> tokenize(String input, boolean dontGenerateNewIds) throws TokenizerException;
}