/*******************************************************************************
* Trombone is a flexible text processing and analysis library used
* primarily by Voyant Tools (voyant-tools.org).
*
* Copyright (©) 2007-2012 Stéfan Sinclair & Geoffrey Rockwell
*
* This file is part of Trombone.
*
* Trombone is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Trombone is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Trombone. If not, see <http://www.gnu.org/licenses/>.
******************************************************************************/
package org.voyanttools.trombone.model;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.Locale;
import java.util.regex.Pattern;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
/**
* An enumeration of the known document formats. The names are generic (like TEXT)
* but can represent a number of file extensions (.txt) and content types (text/plain).
* The utilities for guessing file formats should be taken with a grain of salt,
* as most of them rely on simple heuristics that examine the file extension or
* content type (rather than reading the actual file).
*
* @author Stéfan Sinclair
*/
public enum DocumentFormat {

    // NOTE: declaration order matters. The lookup methods (fromFilename,
    // fromContentType) iterate over values() and return the FIRST format with a
    // matching extension, so when several formats share an extension (e.g. "xml",
    // "zip", "docx") only the earliest declared one can be returned by them.

    /**
     * A PDF document (.pdf).
     */
    PDF("pdf"),

    /**
     * An HTML document (.html, .htm, .xhtml).
     */
    HTML("html", "htm", "xhtml"),

    /**
     * A generic XML document (.xml).
     */
    XML("xml"),

    /**
     * RSS, an XML-based format (.xml).
     */
    RSS("xml"),

    /**
     * RSS2, an XML-based format (.xml).
     */
    RSS2("xml"),

    /**
     * Atom, an XML-based format (.xml).
     */
    ATOM("xml"),

    /**
     * TEI, an XML-based format (.xml).
     */
    TEI("xml"),

    /**
     * DocSouth, an XML-based format (.xml).
     */
    DOCSOUTH("xml"),

    /**
     * TEI corpus, an XML-based format (.xml).
     */
    TEICORPUS("xml"),

    /**
     * Specialized format for treating EEBO XML files.
     */
    EEBODREAM("xml"),

    /**
     * Specialized XML-based format for Hyperlistes files (.xml).
     */
    HYPERLISTES("xml"),

    /**
     * Specialized XML-based format for SatorBase files (.xml).
     */
    SATORBASE("xml"),

    /**
     * Specialized format for treating Dynamic Table of Context files.
     */
    DTOC("xml"),

    /**
     * An MS Word file (.doc).
     */
    MSWORD("doc"),

    /**
     * An MS Word XML file (.docx).
     */
    MSWORDX("docx"),

    /**
     * Specialized format that shares the MS Word extensions (.docx, .doc).
     * NOTE(review): the purpose of this format is not evident from this file.
     */
    TOUCHER("docx", "doc"),

    /**
     * An MS Excel file (.xlsx).
     */
    XLSX("xlsx"),

    /**
     * An RTF file (.rtf).
     */
    RTF("rtf"),

    /**
     * An Apple Pages file (.pages).
     */
    PAGES("pages"),

    /**
     * An Open Document file (.odt).
     */
    ODT("odt"),

    /**
     * An EPUB file (.epub).
     */
    EPUB("epub"),

    /**
     * For Old Bailey adapter http://www.oldbaileyonline.org/obapi/
     */
    OBAPISEARCHJSON("json"),

    /**
     * Old Bailey XML.
     */
    OLDBAILEYXML("xml"),

    /**
     * An archive file ("zip", "cpio", "dump", "jar", "tar.gz", "tar", "tgz", "ar").
     * Declared before {@link #COMPRESSED} so that a ".tar.gz" file name resolves
     * to an archive rather than to a compressed file.
     */
    ARCHIVE("zip", "cpio", "dump", "jar", "tar.gz", "tar", "tgz", "ar"),

    /**
     * Specialized bundle for PBLit (.zip).
     */
    PBLIT("zip"),

    /**
     * A BagIt bundle (.zip).
     */
    BAGIT("zip"),

    /**
     * A compressed file ("gz", "bz2", "gzip", "bzip2", "pack200", "xz").
     */
    COMPRESSED("gz", "bz2", "gzip", "bzip2", "pack200", "xz"),

    /**
     * A file that will be skipped ("png", "gif", "jpg", "jpeg", "bmp", "psd", "css", "js", "json").
     * NOTE(review): because {@link #OBAPISEARCHJSON} is declared earlier and also
     * claims "json", a ".json" file name resolves to that format (which is not
     * skippable) and never reaches this one via {@link #fromFilename(String)}.
     */
    SKIPPABLE("png", "gif", "jpg", "jpeg", "bmp", "psd", "css", "js", "json") {
        @Override
        public boolean isSkippable() {
            return true;
        }
    },

    /**
     * Text files. We'll put this last because of content types that declare things
     * like text/html (we want HTML).
     */
    TEXT("txt", "text"),

    /**
     * An unknown file type (no extensions).
     */
    UNKNOWN;

    /**
     * Matches the start of an XML/HTML end tag (e.g. {@code </p}); compiled once
     * because it is reused on every call to {@link #fromString(String)}.
     */
    private static final Pattern END_TAG_PATTERN = Pattern.compile("</\\w+");

    /**
     * The valid extensions for this enum instance (lower case, without the leading dot).
     */
    private final String[] extensions;

    /**
     * Constructs a new instance with the specified extensions.
     *
     * @param extensions a list of extensions for this format
     */
    private DocumentFormat(String... extensions) {
        this.extensions = extensions;
    }

    /**
     * Determine if this format can be skipped (based on a list of known formats).
     *
     * @return whether or not this format can be skipped
     */
    public boolean isSkippable() {
        return false;
    }

    /**
     * Determine if this is an XML-based format, i.e. whether "xml" is one of
     * its extensions.
     *
     * @return whether or not this is an XML-based format
     */
    public boolean isXml() {
        return hasExtension("xml");
    }

    /**
     * Determine if this is an archive-based format, i.e. whether it shares at
     * least one extension with {@link #ARCHIVE}.
     *
     * @return whether or not this is an archive-based format
     */
    public boolean isArchive() {
        for (String archiveExtension : ARCHIVE.extensions) {
            if (hasExtension(archiveExtension)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Get the default extension of this format, which is the first one declared.
     *
     * @return the first extension, or "unknown" if this format has no extensions
     */
    public String getDefaultExtension() {
        return extensions.length == 0 ? "unknown" : extensions[0];
    }

    /**
     * Get the format based on the file name (and in particular its extension).
     * The comparison is case-insensitive and formats are tried in declaration
     * order, so the first format claiming the extension wins.
     *
     * @param filename the file name (may be null)
     * @return the format (UNKNOWN if it's null or not recognized)
     */
    public static DocumentFormat fromFilename(String filename) {
        if (filename == null) {
            return UNKNOWN;
        }
        // Locale.ROOT keeps the case mapping stable regardless of the default
        // locale (e.g. under a Turkish locale "GIF" would not lower-case to "gif").
        String lowerCaseFileName = filename.toLowerCase(Locale.ROOT);
        for (DocumentFormat format : DocumentFormat.values()) {
            for (String ext : format.extensions) {
                if (lowerCaseFileName.endsWith("." + ext)) {
                    return format;
                }
            }
        }
        return UNKNOWN;
    }

    /**
     * Get the format based on the file (and in particular the file name's extension).
     *
     * @param file the file to examine
     * @return the format (UNKNOWN if it's not recognized)
     */
    public static DocumentFormat fromFile(File file) {
        return fromFilename(file.getName());
    }

    /**
     * Get the format based on the specified content type (or MIME type). This
     * is done by looking at known extensions and seeing if the content type
     * contains any of those strings (e.g. application/xml is XML and text/html
     * is HTML). The comparison is case-insensitive and formats are tried in
     * declaration order.
     * <p>
     * NOTE(review): because this is a plain substring test, any content type that
     * merely contains an extension string matches (e.g. one containing "xml"
     * resolves to XML even if it denotes an Office Open XML document).
     *
     * @param contentType the content type to examine (may be null)
     * @return the format (UNKNOWN if it's null or not recognized)
     */
    public static DocumentFormat fromContentType(String contentType) {
        if (contentType == null) {
            return UNKNOWN;
        }
        // Locale.ROOT on both sides so the comparison does not depend on the
        // default locale's case rules.
        String upperCaseContentType = contentType.toUpperCase(Locale.ROOT);
        for (DocumentFormat format : DocumentFormat.values()) {
            for (String ext : format.extensions) {
                if (upperCaseContentType.contains(ext.toUpperCase(Locale.ROOT))) { // contains
                    return format;
                }
            }
        }
        return UNKNOWN;
    }

    /**
     * Guess the format of the specified string content. The media type is
     * detected with Tika's {@link DefaultDetector} from the UTF-8 bytes of the
     * string and then mapped with {@link #fromContentType(String)}. If that
     * yields TEXT, a quick heuristic promotes the result to XML when the trimmed
     * content starts with "&lt;", ends with "&gt;" and contains an end tag.
     *
     * @param string the content to examine
     * @return the guessed format
     * @throws UnsupportedEncodingException if the UTF-8 encoding is not supported
     * @throws IOException if the detector fails to read the content
     */
    public static DocumentFormat fromString(String string) throws UnsupportedEncodingException, IOException {
        DefaultDetector detector = new DefaultDetector();
        MediaType mediaType = detector.detect(new ByteArrayInputStream(string.getBytes("UTF-8")), new Metadata());
        DocumentFormat format = DocumentFormat.fromContentType(mediaType.toString());
        if (format == DocumentFormat.TEXT) {
            String trimmed = string.trim();
            // quick and dirty check, particularly the end tag syntax
            if (trimmed.startsWith("<") && trimmed.endsWith(">") && END_TAG_PATTERN.matcher(trimmed).find()) {
                format = DocumentFormat.XML;
            }
        }
        return format;
    }

    /**
     * Determine if the file can be skipped. A file is skipped when it is hidden,
     * when its name starts with "." or "__", or when its format (guessed from the
     * file name) is skippable.
     *
     * @param file the File to examine
     * @return whether or not this file can be skipped
     */
    public static boolean isSkippable(File file) {
        if (file.isHidden() || file.getName().startsWith(".")) {
            return true;
        }
        if (file.getName().startsWith("__")) {
            return true;
        }
        return fromFile(file).isSkippable();
    }

    /**
     * Get the format whose name matches the specified string, ignoring case.
     * Unlike {@link #valueOf(String)} this never throws for unrecognized names.
     *
     * @param string the name of the format (may be null)
     * @return the matching format (UNKNOWN if it's null or not recognized)
     */
    public static DocumentFormat getForgivingly(String string) {
        for (DocumentFormat documentFormat : values()) {
            if (documentFormat.name().equalsIgnoreCase(string)) {
                return documentFormat;
            }
        }
        return DocumentFormat.UNKNOWN;
    }

    /**
     * Determine if the specified extension is one of this format's extensions.
     *
     * @param candidate the extension to look for (exact, case-sensitive match)
     * @return whether or not this format declares the extension
     */
    private boolean hasExtension(String candidate) {
        for (String extension : extensions) {
            if (extension.equals(candidate)) {
                return true;
            }
        }
        return false;
    }
}