/*******************************************************************************
* Trombone is a flexible text processing and analysis library used
* primarily by Voyant Tools (voyant-tools.org).
*
* Copyright (©) 2007-2012 Stéfan Sinclair & Geoffrey Rockwell
*
* This file is part of Trombone.
*
* Trombone is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Trombone is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Trombone. If not, see <http://www.gnu.org/licenses/>.
******************************************************************************/
package org.voyanttools.trombone.input.source;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URLConnection;
import org.apache.commons.codec.digest.DigestUtils;
import org.voyanttools.trombone.model.DocumentFormat;
import org.voyanttools.trombone.model.DocumentMetadata;
/**
* An {@link InputSource} associated with a URI.
*
* @author Stéfan Sinclair
*/
public class UriInputSource implements InputSource {

    /** read timeout (in milliseconds) used when only fetching header information */
    private static final int HEADER_READ_TIMEOUT_MILLISECONDS = 15000;

    /** connect timeout (in milliseconds) used when only fetching header information */
    private static final int HEADER_CONNECT_TIMEOUT_MILLISECONDS = 10000;

    /** read timeout (in milliseconds) used when fetching the actual content */
    private static final int CONTENT_READ_TIMEOUT_MILLISECONDS = 60000;

    /** connect timeout (in milliseconds) used when fetching the actual content */
    private static final int CONTENT_CONNECT_TIMEOUT_MILLISECONDS = 15000;

    /**
     * the URI for this input source
     */
    private final URI uri;

    /**
     * the id (hash) for this input source
     */
    private final String id;

    /**
     * the metadata for this input source
     */
    private final DocumentMetadata metadata;

    /**
     * Create a new instance with the specified URI.
     *
     * A connection to the URI is opened in order to read header information
     * (last modified, content length and content type). These values are
     * combined with the URI itself to build the unique id of this input
     * source, and the content type is used to set a default format when it is
     * recognized.
     *
     * @param uri the URI associated with this input source
     * @throws IOException
     *             thrown when there's a problem creating or accessing header
     *             information for the URI
     * @throws IllegalArgumentException
     *             thrown if the URI can't be converted to a URL (for instance
     *             because it's malformed or not absolute)
     */
    public UriInputSource(URI uri) throws IOException {
        this.uri = uri;
        this.metadata = new DocumentMetadata();
        this.metadata.setLocation(uri.toString());
        this.metadata.setSource(Source.URI);
        this.metadata.setTitle(deriveTitle(uri));

        StringBuilder idBuilder = new StringBuilder(uri.toString());

        // establish connection to find other and default metadata
        URLConnection c = null;
        try {
            c = getURLConnection(uri, HEADER_READ_TIMEOUT_MILLISECONDS,
                    HEADER_CONNECT_TIMEOUT_MILLISECONDS);

            // last modified of file
            long modified = c.getLastModified();
            this.metadata.setModified(modified);
            idBuilder.append(modified);

            // try and get length for id
            int length = c.getContentLength();
            idBuilder.append(length);

            String format = c.getContentType();
            if (format != null && !format.isEmpty()) {
                idBuilder.append(format);
                DocumentFormat docFormat = DocumentFormat.fromContentType(format);
                if (docFormat != DocumentFormat.UNKNOWN) {
                    this.metadata.setDefaultFormat(docFormat);
                }
            }
        } finally {
            // only the headers were needed, so release the HTTP connection
            // (instanceof is already false for null)
            if (c instanceof HttpURLConnection) {
                ((HttpURLConnection) c).disconnect();
            }
        }

        this.id = DigestUtils.md5Hex(idBuilder.toString());
    }

    /**
     * Determine a title from the specified URI.
     *
     * @param uri the URI from which to derive a title
     * @return the host when there's no path, the full path when it ends with a
     *         slash, and otherwise the last name in the path
     */
    private static String deriveTitle(URI uri) {
        String path = uri.getPath();
        if (path == null) {
            // opaque URIs (e.g. jar:file:/a.jar!/b.txt or mailto:a@b.c) have
            // no path component, so fall back to the scheme-specific part
            // instead of failing with a NullPointerException
            path = uri.getSchemeSpecificPart();
        }
        if (path.isEmpty() || path.equals("/")) { // no path, use host
            return uri.getHost();
        }
        if (path.endsWith("/")) { // ends in slash, use full path
            return path;
        }
        // try to use file part of URI
        return new File(path).getName();
    }

    /**
     * Open a connection to the specified URI using the timeouts intended for
     * fetching content.
     *
     * @param uri the URI to connect to
     * @return a configured (but not yet connected) {@link URLConnection}
     * @throws IOException thrown if the connection can't be opened
     */
    private URLConnection getURLConnection(URI uri) throws IOException {
        return getURLConnection(uri, CONTENT_READ_TIMEOUT_MILLISECONDS,
                CONTENT_CONNECT_TIMEOUT_MILLISECONDS);
    }

    /**
     * Open a connection to the specified URI with the specified timeouts and
     * with a User-Agent request property set.
     *
     * @param uri the URI to connect to
     * @param readTimeoutMilliseconds the read timeout in milliseconds
     * @param connectTimeoutMilliseconds the connect timeout in milliseconds
     * @return a configured (but not yet connected) {@link URLConnection}
     * @throws IOException thrown if the connection can't be opened
     * @throws IllegalArgumentException thrown if the URI is a malformed URL
     */
    private URLConnection getURLConnection(URI uri, int readTimeoutMilliseconds,
            int connectTimeoutMilliseconds) throws IOException {
        URLConnection c;
        try {
            c = uri.toURL().openConnection();
        } catch (MalformedURLException e) {
            throw new IllegalArgumentException("Attempt to use a malformed URL: " + uri, e);
        }
        c.addRequestProperty("User-Agent", "Mozilla/4.0 (compatible; Trombone)");
        c.setReadTimeout(readTimeoutMilliseconds);
        c.setConnectTimeout(connectTimeoutMilliseconds);
        return c;
    }

    /**
     * Open a new connection to the URI and return its content stream. A new
     * connection is opened on every call.
     *
     * @return the input stream of the content at the URI
     * @throws IOException thrown if the connection or the stream can't be opened
     */
    public InputStream getInputStream() throws MalformedURLException,
            IOException {
        // the connection isn't explicitly closed here: let's hope that it is
        // closed when the caller closes the returned stream
        URLConnection c = getURLConnection(uri);
        return c.getInputStream();
    }

    /**
     * Get the metadata that was collected when this input source was created.
     *
     * @return the metadata for this input source
     */
    public DocumentMetadata getMetadata() {
        return this.metadata;
    }

    /**
     * Get the id of this input source, which is an MD5 hash of the URI
     * combined with the header information read when this input source was
     * created.
     *
     * @return the unique id for this input source
     */
    public String getUniqueId() {
        return this.id;
    }
}