/*******************************************************************************
 * Trombone is a flexible text processing and analysis library used
 * primarily by Voyant Tools (voyant-tools.org).
 * 
 * Copyright (©) 2007-2012 Stéfan Sinclair & Geoffrey Rockwell
 * 
 * This file is part of Trombone.
 * 
 * Trombone is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * Trombone is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with Trombone. If not, see <http://www.gnu.org/licenses/>.
 ******************************************************************************/
package org.voyanttools.trombone.model;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.voyanttools.trombone.storage.Storage;
import org.voyanttools.trombone.util.FlexibleParameters;

import com.thoughtworks.xstream.annotations.XStreamOmitField;

/**
 * A corpus of indexed documents, backed by {@link Storage} and described by
 * {@link CorpusMetadata}. The document list is loaded lazily; documents can
 * be accessed by position, by id, or by iteration.
 * 
 * @author sgs
 */
public class Corpus implements Iterable<IndexedDocument> {

	@XStreamOmitField
	private Storage storage;

	private CorpusMetadata corpusMetadata;

	private List<IndexedDocument> documents = null;

	@XStreamOmitField
	private Map<String, Integer> documentPositionsMap = null;

	public Corpus(Storage storage, CorpusMetadata corpusMetadata) {
		this.storage = storage;
		this.corpusMetadata = corpusMetadata;
	}

	private List<IndexedDocument> getDocumentsList() throws IOException {
		if (documents == null) {
			documentPositionsMap = new HashMap<String, Integer>();
			documents = new ArrayList<IndexedDocument>();
			for (String id : getDocumentIds()) {
				documentPositionsMap.put(id, documents.size());
				documents.add(new IndexedDocument(storage, id));
			}
		}
		return documents;
	}

	public IndexedDocument getDocument(String id) throws IOException {
		if (documentPositionsMap == null) {getDocumentsList();} // this builds the map
		return getDocument(documentPositionsMap.get(id));
	}

	@Override
	public Iterator<IndexedDocument> iterator() {
		try {
			return getDocumentsList().iterator();
		} catch (IOException e) {
			// preserve the underlying cause rather than swallowing it
			throw new RuntimeException("Unable to load corpus documents.", e);
		}
	}

	public int size() throws IOException {
		return getDocumentIds().size();
	}

	public IndexedDocument getDocument(int docIndex) throws IOException {
		return getDocumentsList().get(docIndex);
	}

	public String getId() {
		return corpusMetadata.getId();
	}

	public CorpusMetadata getCorpusMetadata() {
		return corpusMetadata;
	}

	public int getDocumentPosition(String documentId) throws IOException {
		if (documentPositionsMap == null) {getDocumentsList();} // this builds the map
		return documentPositionsMap.get(documentId);
	}

	public List<String> getDocumentIds() {
		return corpusMetadata.getDocumentIds();
	}

	public int getTokensCount(TokenType tokenType) throws IOException {
		return corpusMetadata.getTokensCount(tokenType);
	}

	public int[] getTokensCounts(TokenType tokenType) throws IOException {
		String id = getId()+"-"+tokenType.name().toLowerCase()+"TokenCounts";
		List<String> countList = new ArrayList<String>();
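		// Caching pattern shared with getLastTokenPositions(), getTypesCountMeans()
		// and getTypesCountStdDevs() below: lexical values for all documents are
		// computed and stored in one pass by cacheCommonDocumentValues(); values for
		// other token types are computed and stored here on first request; any
		// values not built during this call are retrieved from storage, then parsed.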
		if (tokenType == TokenType.lexical && storage.isStored(id, Storage.Location.object) == false) {
			cacheCommonDocumentValues();
		} else if (storage.isStored(id, Storage.Location.object) == false) {
			for (IndexedDocument doc : this) {
				countList.add(String.valueOf(doc.getMetadata().getTokensCount(tokenType)));
			}
			storage.storeStrings(countList, id, Storage.Location.object);
		}
		if (countList.isEmpty()) {
			countList = storage.retrieveStrings(id, Storage.Location.object);
		}
		int[] counts = new int[size()];
		int i = 0;
		for (String count : countList) {
			counts[i++] = Integer.parseInt(count);
		}
		return counts;
	}

	public int[] getLastTokenPositions(TokenType tokenType) throws IOException {
		String id = getId()+"-"+tokenType.name().toLowerCase()+"LastTokenPositions";
		List<String> positionsList = new ArrayList<String>();
		if (tokenType == TokenType.lexical && storage.isStored(id, Storage.Location.object) == false) {
			cacheCommonDocumentValues();
		} else if (storage.isStored(id, Storage.Location.object) == false) {
			for (IndexedDocument doc : this) {
				positionsList.add(String.valueOf(doc.getMetadata().getLastTokenPositionIndex(tokenType)));
			}
			storage.storeStrings(positionsList, id, Storage.Location.object);
		}
		if (positionsList.isEmpty()) {
			positionsList = storage.retrieveStrings(id, Storage.Location.object);
		}
		int[] positions = new int[size()];
		int i = 0;
		for (String pos : positionsList) {
			positions[i++] = Integer.parseInt(pos);
		}
		return positions;
	}

	public float[] getTypesCountMeans(TokenType tokenType) throws IOException {
		String id = getId()+"-"+tokenType.name().toLowerCase()+"TypesCountMeans-1";
		List<String> meansList = new ArrayList<String>();
		if (tokenType == TokenType.lexical && storage.isStored(id, Storage.Location.object) == false) {
			cacheCommonDocumentValues();
		} else if (storage.isStored(id, Storage.Location.object) == false) {
			for (IndexedDocument doc : this) {
				meansList.add(String.valueOf(doc.getMetadata().getTypesCountMean(tokenType)));
			}
			storage.storeStrings(meansList, id, Storage.Location.object);
		}
		if (meansList.isEmpty()) {
			meansList = storage.retrieveStrings(id, Storage.Location.object);
		}
		float[] means = new float[size()];
		int i = 0;
		for (String mean : meansList) {
			means[i++] = Float.parseFloat(mean);
		}
		return means;
	}

	public float[] getTypesCountStdDevs(TokenType tokenType) throws IOException {
		String id = getId()+"-"+tokenType.name().toLowerCase()+"TypesCountStdDevs-1";
		List<String> stdDevsList = new ArrayList<String>();
		if (tokenType == TokenType.lexical && storage.isStored(id, Storage.Location.object) == false) {
			cacheCommonDocumentValues();
		} else if (storage.isStored(id, Storage.Location.object) == false) {
			for (IndexedDocument doc : this) {
				stdDevsList.add(String.valueOf(doc.getMetadata().getTypesCountStdDev(tokenType)));
			}
			storage.storeStrings(stdDevsList, id, Storage.Location.object);
		}
		if (stdDevsList.isEmpty()) {
			stdDevsList = storage.retrieveStrings(id, Storage.Location.object);
		}
		float[] stdDevs = new float[size()];
		int i = 0;
		for (String stdDev : stdDevsList) {
			stdDevs[i++] = Float.parseFloat(stdDev);
		}
		return stdDevs;
	}

	public Collection<String> getLanguageCodes() throws IOException {
		if (storage.isStored(getId()+"-langs", Storage.Location.object) == false) {
			cacheCommonDocumentValues();
		}
		return storage.retrieveStrings(getId()+"-langs", Storage.Location.object);
	}
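	// For reference, the storage keys written by cacheCommonDocumentValues() below
	// and read by the accessors above (each key is prefixed with the corpus id):
	//   <corpusId>-langs
	//   <corpusId>-lexicalTokenCounts
	//   <corpusId>-lexicalLastTokenPositions
	//   <corpusId>-lexicalTypesCountMeans-1
	//   <corpusId>-lexicalTypesCountStdDevs-1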
	/**
	 * Computes and stores, in a single pass over the documents, the per-document
	 * values needed by several accessors (language codes, lexical token counts,
	 * last token positions, types count means and standard deviations). This is
	 * to help ensure that we don't load each document's metadata individually,
	 * which takes time.
	 * 
	 * @throws IOException
	 */
	private void cacheCommonDocumentValues() throws IOException {
		Set<String> langs = new HashSet<String>();
		List<String> tokenCounts = new ArrayList<String>();
		List<String> lastTokens = new ArrayList<String>();
		List<String> typesCountMeans = new ArrayList<String>();
		List<String> typesCountStdDevs = new ArrayList<String>();
		DocumentMetadata metadata;
		for (IndexedDocument doc : this) {
			metadata = doc.getMetadata();
			String lang = metadata.getLanguageCode();
			if (lang != null && lang.isEmpty() == false) {langs.add(lang);}
			tokenCounts.add(String.valueOf(metadata.getTokensCount(TokenType.lexical)));
			lastTokens.add(String.valueOf(metadata.getLastTokenPositionIndex(TokenType.lexical)));
			typesCountMeans.add(Float.toString(metadata.getTypesCountMean(TokenType.lexical)));
			typesCountStdDevs.add(Float.toString(metadata.getTypesCountStdDev(TokenType.lexical)));
		}
		if (langs.isEmpty()) {langs.add("??");}
		if (storage.isStored(getId()+"-langs", Storage.Location.object) == false) {
			storage.storeStrings(langs, getId()+"-langs", Storage.Location.object);
		}
		if (storage.isStored(getId()+"-lexicalTokenCounts", Storage.Location.object) == false) {
			storage.storeStrings(tokenCounts, getId()+"-lexicalTokenCounts", Storage.Location.object);
		}
		if (storage.isStored(getId()+"-lexicalLastTokenPositions", Storage.Location.object) == false) {
			storage.storeStrings(lastTokens, getId()+"-lexicalLastTokenPositions", Storage.Location.object);
		}
		if (storage.isStored(getId()+"-lexicalTypesCountMeans-1", Storage.Location.object) == false) {
			storage.storeStrings(typesCountMeans, getId()+"-lexicalTypesCountMeans-1", Storage.Location.object);
		}
		if (storage.isStored(getId()+"-lexicalTypesCountStdDevs-1", Storage.Location.object) == false) {
			storage.storeStrings(typesCountStdDevs, getId()+"-lexicalTypesCountStdDevs-1", Storage.Location.object);
		}
	}

	public CorpusAccess getValidatedCorpusAccess(FlexibleParameters parameters) throws CorpusAccessException {
		String password = parameters.getParameterValue("password", "");
		for (CorpusAccess mode : new CorpusAccess[]{CorpusAccess.ADMIN, CorpusAccess.ACCESS}) {
			String[] passwords = corpusMetadata.getAccessPasswords(mode);
			if (passwords.length > 0) {
				for (String pass : passwords) {
					if (pass.isEmpty() == false && pass.equals(password)) {return mode;}
				}
				// if passwords are defined for full access but none matched, determine
				// what access (if any) is permitted without a password
				if (mode == CorpusAccess.ACCESS) {
					CorpusAccess noPasswordAccess = corpusMetadata.getNoPasswordAccess();
					if (noPasswordAccess == CorpusAccess.ACCESS) {
						throw new CorpusAccessException("Access to this tool requires a valid password.");
					} else if (noPasswordAccess == CorpusAccess.NONCONSUMPTIVE) {
						return CorpusAccess.NONCONSUMPTIVE;
					}
				}
			}
		}
		return CorpusAccess.NORMAL;
	}

	@Override
	public boolean equals(Object obj) {
		if (!(obj instanceof Corpus)) {return false;}
		Corpus corpusObj = (Corpus) obj;
		List<String> corpusObjIds = corpusObj.getDocumentIds();
		List<String> ids = getDocumentIds();
		if (corpusObjIds.size() != ids.size()) {return false;}
		for (int i = 0; i < ids.size(); i++) {
			if (ids.get(i).equals(corpusObjIds.get(i)) == false) {return false;}
		}
		return true;
	}

	@Override
	public int hashCode() {
		// keep the equals/hashCode contract: equality is based on ordered document ids
		return getDocumentIds().hashCode();
	}

}
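/*
 * Example usage (a minimal sketch, not part of the original source; it assumes
 * an existing Storage implementation and a CorpusMetadata instance):
 *
 *   Corpus corpus = new Corpus(storage, corpusMetadata);
 *   System.out.println("Documents in corpus: " + corpus.size());
 *   int[] lexicalCounts = corpus.getTokensCounts(TokenType.lexical);
 *   for (IndexedDocument document : corpus) {
 *       // documents are loaded lazily on first access
 *   }
 */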