/*******************************************************************************
 * Trombone is a flexible text processing and analysis library used
 * primarily by Voyant Tools (voyant-tools.org).
 * 
 * Copyright (©) 2007-2012 Stéfan Sinclair & Geoffrey Rockwell
 * 
 * This file is part of Trombone.
 * 
 * Trombone is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * Trombone is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with Trombone. If not, see <http://www.gnu.org/licenses/>.
 ******************************************************************************/
package org.voyanttools.trombone.model;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.voyanttools.trombone.storage.Storage;
import org.voyanttools.trombone.util.FlexibleParameters;

import com.thoughtworks.xstream.annotations.XStreamOmitField;

/**
 * A corpus of indexed documents, backed by {@link Storage} and described by
 * {@link CorpusMetadata}. The document list is loaded lazily; documents can
 * be accessed by position, by id, or by iteration.
 * 
 * @author sgs
 */
public class Corpus implements Iterable<IndexedDocument> {

	@XStreamOmitField
	private Storage storage;

	private CorpusMetadata corpusMetadata;

	private List<IndexedDocument> documents = null;

	@XStreamOmitField
	private Map<String, Integer> documentPositionsMap = null;

	public Corpus(Storage storage, CorpusMetadata corpusMetadata) {
		this.storage = storage;
		this.corpusMetadata = corpusMetadata;
	}

	private List<IndexedDocument> getDocumentsList() throws IOException {
		if (documents == null) {
			documentPositionsMap = new HashMap<String, Integer>();
			documents = new ArrayList<IndexedDocument>();
			for (String id : getDocumentIds()) {
				documentPositionsMap.put(id, documents.size());
				documents.add(new IndexedDocument(storage, id));
			}
		}
		return documents;
	}

	public IndexedDocument getDocument(String id) throws IOException {
		if (documentPositionsMap == null) {getDocumentsList();} // this builds the map
		return getDocument(documentPositionsMap.get(id));
	}

	@Override
	public Iterator<IndexedDocument> iterator() {
		try {
			return getDocumentsList().iterator();
		} catch (IOException e) {
			// preserve the underlying cause rather than swallowing it
			throw new RuntimeException("Unable to load corpus documents.", e);
		}
	}

	public int size() throws IOException {
		return getDocumentIds().size();
	}

	public IndexedDocument getDocument(int docIndex) throws IOException {
		return getDocumentsList().get(docIndex);
	}

	public String getId() {
		return corpusMetadata.getId();
	}

	public CorpusMetadata getCorpusMetadata() {
		return corpusMetadata;
	}

	public int getDocumentPosition(String documentId) throws IOException {
		if (documentPositionsMap == null) {getDocumentsList();} // this builds the map
		return documentPositionsMap.get(documentId);
	}

	public List<String> getDocumentIds() {
		return corpusMetadata.getDocumentIds();
	}

	public int getTokensCount(TokenType tokenType) throws IOException {
		return corpusMetadata.getTokensCount(tokenType);
	}

	public int[] getTokensCounts(TokenType tokenType) throws IOException {
		String id = getId()+"-"+tokenType.name().toLowerCase()+"TokenCounts";
		List<String> countList = new ArrayList<String>();
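		// Caching pattern shared with getLastTokenPositions(), getTypesCountMeans()
		// and getTypesCountStdDevs() below: lexical values for all documents are
		// computed and stored in one pass by cacheCommonDocumentValues(); values for
		// other token types are computed and stored here on first request; any
		// values not built during this call are retrieved from storage, then parsed.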
		if (tokenType == TokenType.lexical && storage.isStored(id, Storage.Location.object) == false) {
			cacheCommonDocumentValues();
		} else if (storage.isStored(id, Storage.Location.object) == false) {
			for (IndexedDocument doc : this) {
				countList.add(String.valueOf(doc.getMetadata().getTokensCount(tokenType)));
			}
			storage.storeStrings(countList, id, Storage.Location.object);
		}
		if (countList.isEmpty()) {
			countList = storage.retrieveStrings(id, Storage.Location.object);
		}
		int[] counts = new int[size()];
		int i = 0;
		for (String count : countList) {
			counts[i++] = Integer.parseInt(count);
		}
		return counts;
	}

	public int[] getLastTokenPositions(TokenType tokenType) throws IOException {
		String id = getId()+"-"+tokenType.name().toLowerCase()+"LastTokenPositions";
		List<String> positionsList = new ArrayList<String>();
		if (tokenType == TokenType.lexical && storage.isStored(id, Storage.Location.object) == false) {
			cacheCommonDocumentValues();
		} else if (storage.isStored(id, Storage.Location.object) == false) {
			for (IndexedDocument doc : this) {
				positionsList.add(String.valueOf(doc.getMetadata().getLastTokenPositionIndex(tokenType)));
			}
			storage.storeStrings(positionsList, id, Storage.Location.object);
		}
		if (positionsList.isEmpty()) {
			positionsList = storage.retrieveStrings(id, Storage.Location.object);
		}
		int[] positions = new int[size()];
		int i = 0;
		for (String pos : positionsList) {
			positions[i++] = Integer.parseInt(pos);
		}
		return positions;
	}

	public float[] getTypesCountMeans(TokenType tokenType) throws IOException {
		String id = getId()+"-"+tokenType.name().toLowerCase()+"TypesCountMeans-1";
		List<String> meansList = new ArrayList<String>();
		if (tokenType == TokenType.lexical && storage.isStored(id, Storage.Location.object) == false) {
			cacheCommonDocumentValues();
		} else if (storage.isStored(id, Storage.Location.object) == false) {
			for (IndexedDocument doc : this) {
				meansList.add(String.valueOf(doc.getMetadata().getTypesCountMean(tokenType)));
			}
			storage.storeStrings(meansList, id, Storage.Location.object);
		}
		if (meansList.isEmpty()) {
			meansList = storage.retrieveStrings(id, Storage.Location.object);
		}
		float[] means = new float[size()];
		int i = 0;
		for (String mean : meansList) {
			means[i++] = Float.parseFloat(mean);
		}
		return means;
	}

	public float[] getTypesCountStdDevs(TokenType tokenType) throws IOException {
		String id = getId()+"-"+tokenType.name().toLowerCase()+"TypesCountStdDevs-1";
		List<String> stdDevsList = new ArrayList<String>();
		if (tokenType == TokenType.lexical && storage.isStored(id, Storage.Location.object) == false) {
			cacheCommonDocumentValues();
		} else if (storage.isStored(id, Storage.Location.object) == false) {
			for (IndexedDocument doc : this) {
				stdDevsList.add(String.valueOf(doc.getMetadata().getTypesCountStdDev(tokenType)));
			}
			storage.storeStrings(stdDevsList, id, Storage.Location.object);
		}
		if (stdDevsList.isEmpty()) {
			stdDevsList = storage.retrieveStrings(id, Storage.Location.object);
		}
		float[] stdDevs = new float[size()];
		int i = 0;
		for (String stdDev : stdDevsList) {
			stdDevs[i++] = Float.parseFloat(stdDev);
		}
		return stdDevs;
	}

	public Collection<String> getLanguageCodes() throws IOException {
		if (storage.isStored(getId()+"-langs", Storage.Location.object) == false) {
			cacheCommonDocumentValues();
		}
		return storage.retrieveStrings(getId()+"-langs", Storage.Location.object);
	}
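	// For reference, the storage keys written by cacheCommonDocumentValues() below
	// and read by the accessors above (each key is prefixed with the corpus id):
	//   <corpusId>-langs
	//   <corpusId>-lexicalTokenCounts
	//   <corpusId>-lexicalLastTokenPositions
	//   <corpusId>-lexicalTypesCountMeans-1
	//   <corpusId>-lexicalTypesCountStdDevs-1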
	/**
	 * Computes and stores, in a single pass over the documents, the per-document
	 * values needed by several accessors (language codes, lexical token counts,
	 * last token positions, types count means and standard deviations). This is
	 * to help ensure that we don't load each document's metadata individually,
	 * which takes time.
	 * 
	 * @throws IOException
	 */
	private void cacheCommonDocumentValues() throws IOException {
		Set<String> langs = new HashSet<String>();
		List<String> tokenCounts = new ArrayList<String>();
		List<String> lastTokens = new ArrayList<String>();
		List<String> typesCountMeans = new ArrayList<String>();
		List<String> typesCountStdDevs = new ArrayList<String>();
		DocumentMetadata metadata;
		for (IndexedDocument doc : this) {
			metadata = doc.getMetadata();
			String lang = metadata.getLanguageCode();
			if (lang != null && lang.isEmpty() == false) {langs.add(lang);}
			tokenCounts.add(String.valueOf(metadata.getTokensCount(TokenType.lexical)));
			lastTokens.add(String.valueOf(metadata.getLastTokenPositionIndex(TokenType.lexical)));
			typesCountMeans.add(Float.toString(metadata.getTypesCountMean(TokenType.lexical)));
			typesCountStdDevs.add(Float.toString(metadata.getTypesCountStdDev(TokenType.lexical)));
		}
		if (langs.isEmpty()) {langs.add("??");}
		if (storage.isStored(getId()+"-langs", Storage.Location.object) == false) {
			storage.storeStrings(langs, getId()+"-langs", Storage.Location.object);
		}
		if (storage.isStored(getId()+"-lexicalTokenCounts", Storage.Location.object) == false) {
			storage.storeStrings(tokenCounts, getId()+"-lexicalTokenCounts", Storage.Location.object);
		}
		if (storage.isStored(getId()+"-lexicalLastTokenPositions", Storage.Location.object) == false) {
			storage.storeStrings(lastTokens, getId()+"-lexicalLastTokenPositions", Storage.Location.object);
		}
		if (storage.isStored(getId()+"-lexicalTypesCountMeans-1", Storage.Location.object) == false) {
			storage.storeStrings(typesCountMeans, getId()+"-lexicalTypesCountMeans-1", Storage.Location.object);
		}
		if (storage.isStored(getId()+"-lexicalTypesCountStdDevs-1", Storage.Location.object) == false) {
			storage.storeStrings(typesCountStdDevs, getId()+"-lexicalTypesCountStdDevs-1", Storage.Location.object);
		}
	}

	public CorpusAccess getValidatedCorpusAccess(FlexibleParameters parameters) throws CorpusAccessException {
		String password = parameters.getParameterValue("password", "");
		for (CorpusAccess mode : new CorpusAccess[]{CorpusAccess.ADMIN, CorpusAccess.ACCESS}) {
			String[] passwords = corpusMetadata.getAccessPasswords(mode);
			if (passwords.length > 0) {
				for (String pass : passwords) {
					if (pass.isEmpty() == false && pass.equals(password)) {return mode;}
				}
				// if passwords are defined for full access but none matched, determine
				// what access (if any) is permitted without a password
				if (mode == CorpusAccess.ACCESS) {
					CorpusAccess noPasswordAccess = corpusMetadata.getNoPasswordAccess();
					if (noPasswordAccess == CorpusAccess.ACCESS) {
						throw new CorpusAccessException("Access to this tool requires a valid password.");
					} else if (noPasswordAccess == CorpusAccess.NONCONSUMPTIVE) {
						return CorpusAccess.NONCONSUMPTIVE;
					}
				}
			}
		}
		return CorpusAccess.NORMAL;
	}

	@Override
	public boolean equals(Object obj) {
		if (!(obj instanceof Corpus)) {return false;}
		Corpus corpusObj = (Corpus) obj;
		List<String> corpusObjIds = corpusObj.getDocumentIds();
		List<String> ids = getDocumentIds();
		if (corpusObjIds.size() != ids.size()) {return false;}
		for (int i = 0; i < ids.size(); i++) {
			if (ids.get(i).equals(corpusObjIds.get(i)) == false) {return false;}
		}
		return true;
	}

	@Override
	public int hashCode() {
		// keep the equals/hashCode contract: equality is based on ordered document ids
		return getDocumentIds().hashCode();
	}

}
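/*
 * Example usage (a minimal sketch, not part of the original source; it assumes
 * an existing Storage implementation and a CorpusMetadata instance):
 *
 *   Corpus corpus = new Corpus(storage, corpusMetadata);
 *   System.out.println("Documents in corpus: " + corpus.size());
 *   int[] lexicalCounts = corpus.getTokensCounts(TokenType.lexical);
 *   for (IndexedDocument document : corpus) {
 *       // documents are loaded lazily on first access
 *   }
 */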