/*
* Copyright 2012 Research Studios Austria Forschungsges.m.b.H.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package won.protocol.util.linkeddata;
import net.sf.ehcache.CacheException;
import net.sf.ehcache.Ehcache;
import net.sf.ehcache.Element;
import org.apache.jena.query.Dataset;
import org.apache.jena.query.DatasetFactory;
import org.apache.jena.riot.Lang;
import org.apache.jena.riot.RDFDataMgr;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.InitializingBean;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.cache.ehcache.EhCacheCacheManager;
import org.springframework.http.HttpHeaders;
import org.springframework.http.HttpStatus;
import won.protocol.rest.DatasetResponseWithStatusCodeAndHeaders;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.net.URI;
import java.text.MessageFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.CountDownLatch;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static java.util.EnumSet.noneOf;
/**
* LinkedDataSource implementation that uses an ehcache for caching.
*/
@Qualifier("default")
public class CachingLinkedDataSource extends LinkedDataSourceBase implements LinkedDataSource, InitializingBean
{
private static final String CACHE_NAME = "linkedDataCache";
private static final String HTTP_DATE_FORMAT = "EEE, dd MMM yyyy HH:mm:ss z";
private static final int DEFAULT_EXPIRY_PERIOD = 600;
private static final int DEFAULT_BYTE_ARRAY_SIZE = 500;
private final Logger logger = LoggerFactory.getLogger(getClass());
@Autowired(required = true)
private EhCacheCacheManager cacheManager;
private Ehcache cache;
private CrawlerCallback crawlerCallback = null;
//In-memory dataset for caching linked data.
/**
* Removes the element associated with the
* specified URI from the cache
* @param resource
*/
public void invalidate(URI resource) {
assert resource != null : "resource must not be null";
cache.remove(makeCacheKey(resource, null));
}
public void invalidate(URI resource, URI requesterWebID) {
assert (resource != null && requesterWebID != null) : "resource and requester must not be null";
cache.remove(makeCacheKey(resource, requesterWebID));
}
public void clear(){
cache.removeAll();
}
public Dataset getDataForResource(URI resource, URI requesterWebID){
assert resource != null : "resource must not be null";
Element element = null;
try {
element = cache.get(makeCacheKey(resource, requesterWebID));
}catch(CacheException e){
//logging on warn level as not reporting errors here can make misconfiguration hard to detect
logger.warn(String.format("Couldn't fetch resource %s", resource));
logger.debug("Exception is:", e);
return DatasetFactory.createGeneral();
}
LinkedDataCacheEntry linkedDataCacheEntry = null;
if (element != null) {
//cached element found
Object cachedObject = element.getObjectValue();
if (! (cachedObject instanceof LinkedDataCacheEntry)) {
//wrong type - how did that happen?
throw new IllegalStateException(
new MessageFormat("The underlying linkedDataCache should only contain Datasets, but we got a {0} for URI {1}")
.format(new Object[]{cachedObject.getClass(), resource}));
}
linkedDataCacheEntry = (LinkedDataCacheEntry) cachedObject;
}
return fetchOrUseCached(resource, requesterWebID, linkedDataCacheEntry).getDataset();
}
/**
* This method respects the headers 'Expires', 'Cache-Control', and 'ETAG':
* If a cached resource (indicated by a non-null linkedDataCacheEntry) is expired either according to
* the expiry date or the cache-control header from the earlier request, the request will be made.
* When the request is made and an ETAG value is known from an earlier request, it will be sent as the
* 'If-None-Match' header value. In that case the server is expected to answer with status 304 (not modified) and
* the cached response will be used, updating cache control information if the server chooses to send 'Expires' or
* 'Cache-Control' headers.
*
* @param resource the URI of the resource to fetch
* @param requesterWebID optional WebID URI to use for the request
* @param linkedDataCacheEntry optional cache entry to use
* @return
*/
private DatasetResponseWithStatusCodeAndHeaders fetchOrUseCached(final URI resource, final URI
requesterWebID, LinkedDataCacheEntry linkedDataCacheEntry) {
//check
// * if we have a cached result
// * if we can use it
// * make request, possibly using ETAG
// * cache the new result if appropriate
// * if ETAG indicates not modified, return cached result but update caching info
// * return result
DatasetResponseWithStatusCodeAndHeaders responseData = null;
Map<String, String> headers = new HashMap<>();
if (linkedDataCacheEntry != null) {
Date now = new Date();
//before we can return a cached result, make a few checks to see if we
//are allowed to do that:
if (linkedDataCacheEntry.isExpiredAtDate(now)) {
//cache item is expired. Remove from cache and fetch again
cache.remove(makeCacheKey(resource, requesterWebID));
logger.debug("cache item {} expired, fetching again.", resource);
return fetchOnlyOnce(resource, requesterWebID, linkedDataCacheEntry, headers);
}
if (linkedDataCacheEntry.getCacheControlFlags().contains(CacheControlFlag.PRIVATE)){
// in this case we assume that the response is not publicly visible, so it depends on the specified
// requesterWebID. The check is performed by the server. We cannot return a cached response
// immediately, but further down the line the ETAG based system can do that.
logger.debug("cache item {} is Cache-Control:private, will return cached copy only after server checks ETAG, " +
"therefore sending request to server.", resource);
return fetchOnlyOnce(resource, requesterWebID, linkedDataCacheEntry, headers);
}
logger.debug("returning cached version of {}", resource);
//we can use the cached result directly
return linkedDataCacheEntry.recreateResponse();
}
//nothing found in the cache, fetch the resource remotely
logger.debug("Nothing found in cache for {}, fetching remotely", resource);
responseData = fetchOnlyOnce(resource, requesterWebID, null, headers);
//inform the crawler callback
if (crawlerCallback != null){
try {
crawlerCallback.onDatasetCrawled(resource, responseData.getDataset());
} catch (Exception e ){
logger.info(String.format("error during callback execution for dataset %s", resource.toString()), e);
}
}
return responseData;
}
//synchronziation for concurrent requests to the same resource
private ConcurrentMap<String, CountDownLatch> countDownLatchMap = new ConcurrentHashMap<>(10);
/**
* We may run into fetching the same URI multiple times at once. Make sure we make only one http request
* and use the response for every client.
* @param resource
* @param requesterWebID
* @param linkedDataCacheEntry
* @param headers
* @return
*/
private DatasetResponseWithStatusCodeAndHeaders fetchOnlyOnce(final URI resource, final URI requesterWebID, final LinkedDataCacheEntry linkedDataCacheEntry, final Map<String, String> headers) {
String cacheKey = makeCacheKey(resource, requesterWebID);
CountDownLatch latch = new CountDownLatch(1);
CountDownLatch preExistingLatch = countDownLatchMap.putIfAbsent(cacheKey, latch);
try {
if (preExistingLatch != null) {
logger.debug("resource " + cacheKey +" is being fetched in another thread, we wait for its result and use it " +
"if it turns out to be cacheable");
//in this case, another thread is already fetching the URI. Wait.
try {
preExistingLatch.await();
} catch (InterruptedException e) {
logger.warn("interrupted while waiting for another thread to fetch '" + resource + "'");
}
//now, the other thread is done fetching the resource. It may not have been allowed to cache it, in which case
//we have to fetch it again. We try:
Element element = cache.get(cacheKey);
if (element != null) {
logger.debug("resource " + cacheKey +" turned out to be cacheable, using it");
//ok, we'll recreate a response from the cache.
//Caution: this is not a copy, it's the SAME dataset - so manipulating the result causes side-effects.
LinkedDataCacheEntry entry = (LinkedDataCacheEntry) element.getObjectValue();
return entry.recreateResponse();
}
logger.debug("resource " + cacheKey +" did not turn out to be cacheable - fetching it, too");
//so the cache still doesn't have it. We think it's better to let every thread
//fetch it for itself.
}
DatasetResponseWithStatusCodeAndHeaders datasetResponse = fetchAndCacheIfAppropriate(resource, requesterWebID,
linkedDataCacheEntry,
headers);
return datasetResponse;
} finally {
//remove the latch from the map if it is in there
countDownLatchMap.remove(cacheKey, latch);
//wake up all threads that might now be waiting at our latch
latch.countDown();
}
}
private DatasetResponseWithStatusCodeAndHeaders fetchAndCacheIfAppropriate(
final URI resource,
final URI requesterWebID,
final LinkedDataCacheEntry linkedDataCacheEntry,
final Map<String, String> headers) {
DatasetResponseWithStatusCodeAndHeaders responseData = fetchWithEtagValidation(resource,
requesterWebID, linkedDataCacheEntry, headers);
Date expires = parseCacheControlMaxAgeValue(resource, responseData);
if (expires == null) {
expires = parseExpiresHeader(resource, responseData);
}
EnumSet<CacheControlFlag> cacheControlFlags = parseCacheControlHeaderFlags(resource, responseData);
if (cacheControlFlags.contains(CacheControlFlag.NO_STORE) || cacheControlFlags.contains(CacheControlFlag.NO_CACHE)){
//we are not allowed to cache the result
//make sure it's not in the cache from a previous request
cache.remove(makeCacheKey(resource, requesterWebID));
logger.debug("Fetched {}. Will not be cached due to Cache-Control headers sent by server",
resource);
return responseData;
}
Date responseDate = parseDateHeader(resource, responseData);
if (responseDate != null && expires != null){
//old way of saying don't cache: Date header >= Expires header
if (responseDate.equals(expires) || responseDate.after(expires)) {
//we are not allowed to cache the result
//make sure it's not in the cache from a previous request
logger.debug("Fetched {}. Will not be cached due to Expires/Date header combination sent by server", resource);
cache.remove(makeCacheKey(resource, requesterWebID));
return responseData;
}
}
//if we don't get a new etag, see if we have a 304 code - then we can use th old etag
String etag = responseData.getResponseHeaders().get(HttpHeaders.ETAG);
if (etag == null && responseData.getStatusCode() == HttpStatus.NOT_MODIFIED.value()
&& linkedDataCacheEntry != null){
etag = linkedDataCacheEntry.getEtag();
}
//cache the result
LinkedDataCacheEntry entry = new LinkedDataCacheEntry(etag, expires, writeDatasetToByteArray(responseData.getDataset()),
cacheControlFlags, responseData.getResponseHeaders(),
responseData.getStatusCode());
this.cache.put(new Element(makeCacheKey(resource, requesterWebID), entry));
logger.debug("Fetched and cached {} ", resource);
return responseData;
}
private static Dataset readDatasetFromByteArray(byte[] datasetbytes) {
Dataset dataset = DatasetFactory.create();
RDFDataMgr.read(dataset, new ByteArrayInputStream(datasetbytes), Lang.NQUADS);
return dataset;
}
private static byte[] writeDatasetToByteArray(Dataset dataset) {
ByteArrayOutputStream out = new ByteArrayOutputStream(DEFAULT_BYTE_ARRAY_SIZE);
RDFDataMgr.write(out, dataset, Lang.NQUADS);
return out.toByteArray();
}
/**
* Checks if the cached entry has an ETAG value set and uses the 'If-None-Match' header if this is the case.
* If the server responds with 304 - NOT_MODIFIED, the cached dataset replaces the (empty) dataset coming from the
* server in the DatasetResponseWithStatusCodeAndHeaders.
*
* @param resource
* @param requesterWebID
* @param linkedDataCacheEntry
* @param headers
* @return
*/
private DatasetResponseWithStatusCodeAndHeaders fetchWithEtagValidation(
final URI resource,
final URI requesterWebID,
final LinkedDataCacheEntry linkedDataCacheEntry,
final Map<String, String> headers) {
if (linkedDataCacheEntry == null || linkedDataCacheEntry.getEtag() == null){
logger.debug("fetching from server without ETAG validation: {} ", resource);
return fetch(resource, requesterWebID, headers);
}
//we already have an etag - use it for validating
Map<String, String> myHeaders = headers != null ? headers : new HashMap<>();
myHeaders.put(HttpHeaders.IF_NONE_MATCH, linkedDataCacheEntry.getEtag());
logger.debug("fetching from server with ETAG validation: {} ", resource);
DatasetResponseWithStatusCodeAndHeaders datasetResponse = fetch(resource, requesterWebID,
myHeaders);
if (datasetResponse.getStatusCode() == HttpStatus.NOT_MODIFIED.value()){
//replace dataset in response with the cached dataset
logger.debug("server said our ETAG is still valid, using cached dataset for URI {} ", resource);
datasetResponse = new DatasetResponseWithStatusCodeAndHeaders(readDatasetFromByteArray(linkedDataCacheEntry
.getDataset()),
datasetResponse
.getStatusCode(),
datasetResponse
.getResponseHeaders());
} else {
logger.debug("server said our ETAG is not valid, not using cached result for URI {} ", resource);
// We would like to remove the item from the cache immediately because it is now outdated. However, we cannot
// remove the cached result from the cache here because we may have gotten any response from the
// server (i.e. 1xx, 2xx, 3xx, 4xx, 5xx). However, if the ETAG isn't valid, we'll overwrite the cache entry down
// the line or remove it if the server decides to forbid caching.
}
return datasetResponse;
}
/**
* Performs the actual request via the linkedDataRestClient.
* @param resource
* @param requesterWebID
* @param headers
* @return
*/
private DatasetResponseWithStatusCodeAndHeaders fetch(final URI resource, final URI
requesterWebID, final Map<String, String> headers) {
final DatasetResponseWithStatusCodeAndHeaders responseData;
if (requesterWebID != null){
logger.debug("fetching linked data for URI {} with WebID {}", resource, requesterWebID);
responseData = linkedDataRestClient.readResourceDataWithHeaders(resource, requesterWebID, headers);
} else {
logger.debug("fetching linked data for URI {} without WebID", resource, requesterWebID);
responseData = linkedDataRestClient.readResourceDataWithHeaders(resource, headers);
}
return responseData;
}
private Date parseExpiresHeader(final URI resource, final DatasetResponseWithStatusCodeAndHeaders responseData) {
String expiresHeader = responseData.getResponseHeaders().get(HttpHeaders.EXPIRES);
if (expiresHeader == null) {
return null;
}
expiresHeader = expiresHeader.trim();
SimpleDateFormat format = new SimpleDateFormat(HTTP_DATE_FORMAT, Locale.ENGLISH);
Date expires = null;
try {
expires = format.parse(expiresHeader);
} catch (ParseException e) {
//cannot parse expires header - use a default
expires = addNSecondsToNow(DEFAULT_EXPIRY_PERIOD);
//TODO: there seems to be a problem with the Expires header from the LinkedDataService
logger.debug("could not parse 'Expires' header ' "
+ expiresHeader +"' obtained for '" + resource + "', using default expiry period of " + DEFAULT_EXPIRY_PERIOD
+" " +
"seconds");
}
return expires;
}
private Date addNSecondsToNow(int seconds) {
final Date expires;Calendar cal = Calendar.getInstance();
cal.setTime(new Date());
cal.add(Calendar.SECOND, seconds);
expires = cal.getTime();
return expires;
}
private Date parseDateHeader(final URI resource, final DatasetResponseWithStatusCodeAndHeaders
responseData) {
String dateHeader = responseData.getResponseHeaders().get(HttpHeaders.DATE);
if (dateHeader == null){
return null;
}
SimpleDateFormat format = new SimpleDateFormat(HTTP_DATE_FORMAT, Locale.ENGLISH);
Date date = null;
try {
date = format.parse(dateHeader);
} catch (ParseException e) {
//cannot parse expires header - use a default
date = new Date();
logger.warn("could not parse 'Date' header ' "
+ dateHeader +"' obtained for '" + resource + "', using current date");
}
return date;
}
private EnumSet<CacheControlFlag> parseCacheControlHeaderFlags(final URI resource, final DatasetResponseWithStatusCodeAndHeaders responseData) {
String cacheControlHeaderValue = responseData.getResponseHeaders().get(HttpHeaders.CACHE_CONTROL);
EnumSet<CacheControlFlag> cacheControlFlags = EnumSet.noneOf(CacheControlFlag.class);
if (cacheControlHeaderValue == null) return cacheControlFlags;
String[] values = cacheControlHeaderValue.split(",");
for (String value : values){
CacheControlFlag flag = CacheControlFlag.forName(value.trim());
if (flag != null) {
cacheControlFlags.add(flag);
}
}
return cacheControlFlags;
}
private Date parseCacheControlMaxAgeValue(final URI resource, final DatasetResponseWithStatusCodeAndHeaders responseData) {
String cacheControlHeaderValue = responseData.getResponseHeaders().get(HttpHeaders.CACHE_CONTROL);
if (cacheControlHeaderValue == null) return null;
Pattern maxagePattern = Pattern.compile("[^\\s,]*max-age\\s*=\\s*(\\d+)[$\\s,]");
Matcher m = maxagePattern.matcher(cacheControlHeaderValue);
if (!m.find()) return null;
String maxAgeValueString = m.group(1);
int maxAgeInt = 3600;
try {
maxAgeInt = Integer.parseInt(maxAgeValueString);
} catch (NumberFormatException e){
logger.warn("could not parse 'Expires' header ' "
+ cacheControlHeaderValue +"' obtained for '" + resource + "' using default expiry period of 1 hour",e );
}
Calendar cal = Calendar.getInstance();
cal.setTime(new Date());
cal.add(Calendar.SECOND, maxAgeInt);
return cal.getTime();
}
@Override
public Dataset getDataForResource(final URI resource) {
return getDataForResource(resource, null);
}
@Override
public void afterPropertiesSet() throws Exception
{
Ehcache baseCache = cacheManager.getCacheManager().getCache(CACHE_NAME);
if (baseCache == null) {
throw new IllegalArgumentException(String.format("could not find a cache with name '%s' in ehcache config",
CACHE_NAME));
}
//this.cache = new SelfPopulatingCache(baseCache, new LinkedDataCacheEntryFactory());
this.cache = baseCache;
}
public void setCacheManager(final EhCacheCacheManager cacheManager)
{
this.cacheManager = cacheManager;
}
@Autowired(required = false)
public void setCrawlerCallback(final CrawlerCallback crawlerCallback) {
this.crawlerCallback = crawlerCallback;
}
public static enum CacheControlFlag
{
PUBLIC("public"),
PRIVATE("private"),
NO_CACHE("no-cache"),
NO_STORE("no-store"),
MUST_REVALIDATE("must-revalidate")
;
private String name;
CacheControlFlag(final String name) {
this.name = name;
}
public static CacheControlFlag forName(String name){
switch (name){
case "public": return PUBLIC;
case "private": return PRIVATE;
case "no-cache": return NO_CACHE;
case "no-store": return NO_STORE;
case "must-revalidate": return MUST_REVALIDATE;
}
return null;
}
public String getName() {
return name;
}
}
public static class LinkedDataCacheEntry
{
private String etag = null;
private Date expires = null;
private byte[] dataset = null;
private EnumSet<CacheControlFlag> cacheControlFlags = noneOf(CacheControlFlag.class);
private Map<String, String> headers;
private int statusCode;
public LinkedDataCacheEntry(final String etag, final Date expires, final byte[] dataset, final EnumSet<CacheControlFlag> cacheControlFlags, final Map<String, String> headers, final int statusCode) {
this.etag = etag;
this.expires = expires;
this.dataset = dataset;
this.cacheControlFlags = cacheControlFlags != null ? cacheControlFlags : noneOf(CacheControlFlag.class);
this.headers = headers;
this.statusCode = statusCode;
}
public DatasetResponseWithStatusCodeAndHeaders recreateResponse(){
return new DatasetResponseWithStatusCodeAndHeaders(readDatasetFromByteArray(dataset), statusCode, headers);
}
public String getEtag() {
return etag;
}
public byte[] getDataset() {
return dataset;
}
public Date getExpires() {
return expires;
}
public EnumSet<CacheControlFlag> getCacheControlFlags() {
return cacheControlFlags;
}
/**
* Checks if the cache item is expired at the given date.
* If the cache item has no expiry date set, the method returns false for any given date.
* @param when
* @return
*/
public boolean isExpiredAtDate(final Date when) {
if (expires == null) return false;
return expires.before(when);
}
}
private String makeCacheKey(URI resource, URI requesterWebID){
//using spaces in the null placeholder to make it impossible to inject a requesterWebID URI that is equal to the
//null place holder (because an URI can't have spaces).
return resource.toString() + (requesterWebID == null ? " (no Web ID)":requesterWebID.toString());
}
}