DocumentFormat.java example

Explorer

trombone-master
- src
  - main
    - java
  - test
    - java
      - org
        voyanttools
        trombone
        document
        MetadataTest.java
        input
        expand
        ArchiveExpanderTest.java
        CompressedExpanderTest.java
        XmlExpanderTest.java
        XslExpanderTest.java
        extract
        BagItExtractorTest.java
        TikaExtractorTest.java
        XmlExtractorTest.java
        index
        LuceneIndexerTest.java
        lucene
        StoredToLuceneDocumentMapperTest.java
        analysis
        OpenNlpLemmaTokenizerTest.java
        StanfordNlpLemmaTokenizerTest.java
        search
        FieldPrefixAwareSimpleQueryParserTest.java
        model
        CorpusCollocateTest.java
        CorpusTermMinimalsDBTest.java
        CorpusTermsQueueTest.java
        DocumentTermsTest.java
        KeywordsTest.java
        TableTest.java
        storage
        file
        FileStoredDocumentSourceStorageTest.java
        TromboneMigration.java
        tool
        DocumentCollocatesTest.java
        DocumentTermsTest.java
        KwicsTest.java
        StoredResourceTest.java
        TableCorrelationsTest.java
        TableManagerTest.java
        build
        CorpusBuilderTest.java
        CorpusCreatorTest.java
        DocumentExpanderTest.java
        DocumentExtractorTest.java
        DocumentStorerTest.java
        corpus
        CorpusCollocatesTest.java
        CorpusFacetsTest.java
        CorpusManagerTest.java
        CorpusTermsTest.java
        DocumentContextsTest.java
        DocumentNgramsTest.java
        DocumentTermsTest.java
        DocumentTokensTest.java
        DocumentsMetadataTest.java
        SimpleSortedSetFacetsExample.java
        VelizaTest.java
        util
        EmbeddedWebServer.java
        TestHelper.java

/*******************************************************************************
 * Trombone is a flexible text processing and analysis library used
 * primarily by Voyant Tools (voyant-tools.org).
 * 
 * Copyright (©) 2007-2012 Stéfan Sinclair & Geoffrey Rockwell
 * 
 * This file is part of Trombone.
 * 
 * Trombone is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * Trombone is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with Trombone.  If not, see <http://www.gnu.org/licenses/>.
 ******************************************************************************/
package org.voyanttools.trombone.model;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.util.Locale;
import java.util.regex.Pattern;

import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;

/**
 * An enumeration of the known document formats. The names are generic (like TEXT)
 * but can represent a number of file extensions (.txt) and content types (text/plain).
 * The utilities for guessing file formats should be taken with a grain of salt,
 * as most of them rely on simple heuristics that examine the file extension or
 * content type (not the actual contents of the file).
 * 
 * @author Stéfan Sinclair
 */
public enum DocumentFormat {

	// NOTE: declaration order matters. fromFilename(String) and
	// fromContentType(String) return the FIRST constant (in declaration order)
	// that has a matching extension. Constants that only declare extensions
	// already declared by an earlier constant (e.g. RSS, TEI, PBLIT, BAGIT) are
	// therefore never returned by those lookups; they are reachable by name,
	// for instance through getForgivingly(String).

	/**
	 * A PDF document (.pdf).
	 */
	PDF("pdf"),
	
	/**
	 * An HTML document (.html, .htm, .xhtml).
	 */
	HTML("html", "htm", "xhtml"),

	/**
	 * A generic XML document (.xml). This is the first format declared with the
	 * "xml" extension, so it is the one returned for .xml file names.
	 */
	XML("xml"),

	/**
	 * An RSS document (XML-based, .xml).
	 */
	RSS("xml"),

	/**
	 * An RSS2 document (XML-based, .xml).
	 */
	RSS2("xml"),

	/**
	 * An Atom document (XML-based, .xml).
	 */
	ATOM("xml"),

	/**
	 * A TEI document (XML-based, .xml).
	 */
	TEI("xml"),

	/**
	 * A DocSouth document (XML-based, .xml).
	 */
	DOCSOUTH("xml"),

	/**
	 * A TEI corpus document (XML-based, .xml).
	 */
	TEICORPUS("xml"),
	
	/**
	 * Specialized format for treating EEBO XML files.
	 */
	EEBODREAM("xml"),
	
	/**
	 * Specialized XML-based format (.xml) named for Hyperlistes.
	 * NOTE(review): the previous description was copied from EEBODREAM;
	 * confirm the intended semantics.
	 */
	HYPERLISTES("xml"),
	
	/**
	 * Specialized XML-based format (.xml) named for SatorBase.
	 * NOTE(review): the previous description was copied from EEBODREAM;
	 * confirm the intended semantics.
	 */
	SATORBASE("xml"),
	
	/**
	 * Specialized format for treating Dynamic Table of Context files.
	 */
	DTOC("xml"),
	
	/**
	 * An MS Word file (.doc).
	 */
	MSWORD("doc"),
	
	/**
	 * An MS Word XML file (.docx).
	 */
	MSWORDX("docx"),
	
	/**
	 * Specialized format registered with the .docx and .doc extensions.
	 * NOTE(review): the purpose of this format is not documented in this file.
	 */
	TOUCHER("docx", "doc"),
	
	/**
	 * An MS Excel file (.xlsx).
	 */
	XLSX("xlsx"),
	
	/**
	 * An RTF file (.rtf).
	 */
	RTF("rtf"),
	
	/**
	 * An Apple Pages file (.pages).
	 */
	PAGES("pages"),
	
	/**
	 * An Open Document file (.odt).
	 */
	ODT("odt"),
	
	/**
	 * An EPUB file (.epub).
	 */
	EPUB("epub"),
	
	/**
	 * For Old Bailey adapter http://www.oldbaileyonline.org/obapi/
	 * Because this constant is declared before {@link #SKIPPABLE}, it is the
	 * format returned for .json file names.
	 */
	OBAPISEARCHJSON("json"),
	
	/**
	 * Old Bailey XML.
	 */
	OLDBAILEYXML("xml"),
	
	/**
	 * An archive file ("zip", "cpio", "dump", "jar", "tar.gz", "tar", "tgz", "ar").
	 */
	ARCHIVE("zip", "cpio", "dump", "jar", "tar.gz", "tar", "tgz", "ar"),
	
	/**
	 * Specialized bundle for PBLit (.zip).
	 */
	PBLIT("zip"),
	
	/**
	 * A BagIt bundle (.zip).
	 */
	BAGIT("zip"),
	
	/**
	 * A compressed file ("gz", "bz2", "gzip", "bzip2", "pack200", "xz").
	 */
	COMPRESSED("gz", "bz2", "gzip", "bzip2", "pack200", "xz"),
	
	/**
	 * A file that will be skipped ("png", "gif", "jpg", "jpeg", "bmp", "psd", "css", "js", "json").
	 */
	SKIPPABLE("png", "gif", "jpg", "jpeg", "bmp", "psd", "css", "js", "json") {
		@Override
		public boolean isSkippable() {
			return true;
		}
	},
	
	/**
	 * Text files. We'll put this last because of content types that declare things like text/html (we want HTML).
	 */
	TEXT("txt", "text"),

	/**
	 * An unknown file type (declares no extensions).
	 */
	UNKNOWN;

	/**
	 * Loose pattern for the start of an XML end tag (e.g. {@code </p}); compiled
	 * once because {@link #fromString(String)} may be called repeatedly.
	 */
	private static final Pattern END_TAG_PATTERN = Pattern.compile("</\\w+");

	/**
	 * The valid extensions for this enum instance (lower case, without the leading dot).
	 */
	private final String[] extensions;

	/**
	 * Constructs a new instance with the specified extensions.
	 * 
	 * @param extensions a list of extensions for this format
	 */
	private DocumentFormat(String... extensions) {
		this.extensions = extensions;
	}

	/**
	 * Determine if this format can be skipped (based on a list of known formats).
	 * Only {@link #SKIPPABLE} overrides this to return true.
	 * 
	 * @return whether or not this format can be skipped
	 */
	public boolean isSkippable() {
		return false;
	}
	
	/**
	 * Determine if this is an XML-based format, which is the case when "xml"
	 * is one of the format's declared extensions.
	 * 
	 * @return whether or not this is an XML-based format
	 */
	public boolean isXml() {
		return hasExtension("xml");
	}
	
	/**
	 * Determine if this is an archive format, which is the case when this format
	 * shares at least one declared extension with {@link #ARCHIVE}.
	 * 
	 * @return whether or not this is an archive format
	 */
	public boolean isArchive() {
		for (String archiveExtension : ARCHIVE.extensions) {
			if (hasExtension(archiveExtension)) {
				return true;
			}
		}
		return false;
	}
	
	/**
	 * Get the default extension for this format, which is the first declared extension.
	 * 
	 * @return the first declared extension, or "unknown" if this format declares none
	 */
	public String getDefaultExtension() {
		return extensions.length==0 ? "unknown" : extensions[0];
	}

	/**
	 * Determine if the specified extension is declared by this format (exact match).
	 * 
	 * @param candidate the extension to look for (without the leading dot)
	 * @return whether or not this format declares the extension
	 */
	private boolean hasExtension(String candidate) {
		for (String extension : extensions) {
			if (extension.equals(candidate)) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Get the format based on the file name (and in particular its extension).
	 * The comparison is case-insensitive and locale-independent. Formats are
	 * tested in declaration order and the first one with a matching extension wins.
	 * 
	 * @param filename the file name (may be null)
	 * @return the format (UNKNOWN if it's not recognized or if the file name is null)
	 */
	public static DocumentFormat fromFilename(String filename) {
		if (filename == null) {
			return UNKNOWN;
		}
		// Locale.ROOT: the default locale could change the casing rules (e.g. in
		// Turkish "ZIP" lower-cases to "zıp", which would not match ".zip")
		String lowerCaseFileName = filename.toLowerCase(Locale.ROOT);
		for (DocumentFormat format : DocumentFormat.values()) {
			for (String ext : format.extensions) {
				if (lowerCaseFileName.endsWith("."+ext)) {
					return format;
				}
			}
		}
		return UNKNOWN;
	}

	/**
	 * Get the format based on the file (and in particular the file name's extension).
	 * 
	 * @param file the file to examine
	 * @return the format (UNKNOWN if it's not recognized)
	 */
	public static DocumentFormat fromFile(File file) {
		return fromFilename(file.getName());
	}

	/**
	 * Get the format based on the specified content type (or MIME type). This
	 * is done by looking at known extensions and seeing if the content type
	 * contains any of those strings (e.g. application/xml is XML and text/html
	 * is HTML). The comparison is case-insensitive and locale-independent.
	 * 
	 * <p>This is a plain substring test performed in declaration order, so short
	 * extensions can match unrelated content types (for example, a content type
	 * containing "ar" matches ARCHIVE unless an earlier format matches first).</p>
	 * 
	 * @param contentType the content type to examine (may be null)
	 * @return the format (UNKNOWN if it's not recognized or if the content type is null)
	 */
	public static DocumentFormat fromContentType(String contentType) {
		if (contentType == null) {
			return UNKNOWN;
		}
		// declared extensions are lower case, so normalizing the content type
		// once (independently of the default locale) is enough
		String lowerCaseContentType = contentType.toLowerCase(Locale.ROOT);
		for (DocumentFormat format : DocumentFormat.values()) {
			for (String ext : format.extensions) {
				if (lowerCaseContentType.contains(ext)) { // contains
					return format;
				}
			}
		}
		return UNKNOWN;
	}
	
	/**
	 * Get the format by examining the contents of the specified string. The
	 * UTF-8 bytes of the string are given to Tika's {@link DefaultDetector} and
	 * the detected media type is mapped using {@link #fromContentType(String)}.
	 * If that yields TEXT but the trimmed string starts with "&lt;", ends with
	 * "&gt;" and contains something that looks like an end tag, XML is returned
	 * instead.
	 * 
	 * @param string the contents to examine
	 * @return the format (UNKNOWN if it's not recognized)
	 * @throws UnsupportedEncodingException declared for backward compatibility
	 * @throws IOException if the detector fails to read the contents
	 */
	public static DocumentFormat fromString(String string) throws UnsupportedEncodingException, IOException {
		DefaultDetector detector = new DefaultDetector();
		byte[] bytes = string.getBytes(StandardCharsets.UTF_8);
		MediaType mediaType = detector.detect(new ByteArrayInputStream(bytes), new Metadata());
		DocumentFormat format = DocumentFormat.fromContentType(mediaType.toString());
		
		if (format==DocumentFormat.TEXT) {
			String trimmed = string.trim();
			// quick and dirty check, particularly the end tag syntax
			if (trimmed.startsWith("<") && trimmed.endsWith(">") && END_TAG_PATTERN.matcher(trimmed).find()) {
				format = DocumentFormat.XML;
			}
		}
		return format;
	}

	/**
	 * Determine if the file can be skipped. A file is skipped when it is hidden,
	 * when its name starts with "." or "__", or when the format determined from
	 * its name is skippable.
	 * 
	 * @param file the File to examine
	 * @return whether or not this file can be skipped
	 */
	public static boolean isSkippable(File file) {
		String name = file.getName();
		if (file.isHidden() || name.startsWith(".")) {
			return true;
		}
		if (name.startsWith("__")) {
			return true;
		}
		return fromFilename(name).isSkippable();
	}

	/**
	 * Get the format whose name matches the specified string, ignoring case.
	 * Unlike {@link #valueOf(String)}, this never throws for an unrecognized name.
	 * 
	 * @param string the name of the format (may be null)
	 * @return the matching format (UNKNOWN if no format has that name)
	 */
	public static DocumentFormat getForgivingly(String string) {
		for (DocumentFormat documentFormat : values()) {
			if (documentFormat.name().equalsIgnoreCase(string)) {
				return documentFormat;
			}
		}
		return DocumentFormat.UNKNOWN;
	}
}