package eu.europeana.cloud.migrator.provider;

import eu.europeana.cloud.common.model.DataProviderProperties;
import eu.europeana.cloud.migrator.ResourceMigrator;
import org.apache.log4j.Logger;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.file.FileSystems;
import java.nio.file.Files;
import java.nio.file.InvalidPathException;
import java.nio.file.Path;
import java.util.*;

/**
 * Resource provider used for the Europeana Newspapers migration.
 * <p>
 * On construction it reads a mapping file (a {@code ;}-delimited csv in UTF-8) where every line holds
 * a local identifier followed by the paths of the files belonging to it. From that file it builds:
 * <ul>
 * <li>a reversed mapping (file path -&gt; local identifier),</li>
 * <li>a duplicate mapping (file path -&gt; local identifier) for lines containing a path that was already mapped,</li>
 * <li>the number of file paths listed for every local identifier.</li>
 * </ul>
 */
public class EuropeanaNewspapersResourceProvider
        extends DefaultResourceProvider {

    /** Name of the directory that precedes the resource provider directory in a resource path. */
    public static final String IMAGE_DIR = "image";

    private static final Logger logger = Logger.getLogger(EuropeanaNewspapersResourceProvider.class);

    /** File path -> local identifier, for paths that occurred in the mapping file only once. */
    private final Map<String, String> reversedMapping = new HashMap<String, String>();

    /** File path -> local identifier, for paths of lines recognised as duplicates. */
    private final Map<String, String> duplicateMapping = new HashMap<String, String>();

    /** Local identifier -> number of non-empty paths listed in its mapping file line. */
    private final Map<String, Integer> fileCounts = new HashMap<String, Integer>();

    /**
     * Creates the provider and immediately loads the mapping file.
     *
     * @param representationName passed to the superclass
     * @param mappingFile        name or path of the mapping file, passed to the superclass
     * @param locations          passed to the superclass
     * @param dataProviderId     identifier of the data provider, must not be null
     * @throws IOException              when the mapping file cannot be found or read
     * @throws IllegalArgumentException when {@code dataProviderId} is null
     */
    public EuropeanaNewspapersResourceProvider(String representationName, String mappingFile, String locations, String dataProviderId) throws IOException {
        super(representationName, mappingFile, locations, dataProviderId);
        if (dataProviderId == null) {
            throw new IllegalArgumentException("Data provider identifier must be specified for Europeana Newspapers migration!");
        }
        readMappingFile();
    }

    /**
     * Reads mapping file given while constructing this object.
     * File must be a csv file with ; delimited lists of local identifier and paths to files of the issue.
     * Encoding is UTF-8.
     * <p>
     * When a path on a line is already present in the reversed mapping, the rest of that line is treated
     * as a duplicate: paths of the line stored so far are moved from the reversed mapping to the duplicate
     * mapping and all following paths of the line go to the duplicate mapping directly.
     *
     * @throws IOException when the mapping file cannot be found or read
     */
    private void readMappingFile() throws IOException {
        Path mappingPath = resolveMappingPath();

        // try-with-resources closes the reader also when reading fails
        try (BufferedReader reader = Files.newBufferedReader(mappingPath, Charset.forName("UTF-8"))) {
            String line;
            while ((line = reader.readLine()) != null) {
                StringTokenizer tokenizer = new StringTokenizer(line, ";");
                // first token is local identifier
                String localId = tokenizer.hasMoreTokens() ? tokenizer.nextToken().trim() : null;
                if (localId == null) {
                    logger.warn("Local identifier is null (" + localId + "). Skipping line.");
                    continue;
                }
                localId = localId.intern();

                boolean duplicate = false;
                // paths of the current line that were put into the reversed mapping
                List<String> paths = new ArrayList<String>();
                int count = 0;
                while (tokenizer.hasMoreTokens()) {
                    String path = tokenizer.nextToken().trim();
                    // when path is empty do not add to map
                    if (path.isEmpty()) {
                        continue;
                    }
                    String knownLocalId = reversedMapping.get(path);
                    if (knownLocalId != null && !duplicate) {
                        logger.warn("File " + path + " already has a local id = " + knownLocalId + ". New local id = " + localId);
                        duplicate = true;
                        // move everything stored for this line so far to the duplicates map
                        for (String s : paths) {
                            reversedMapping.remove(s);
                            duplicateMapping.put(s, localId);
                        }
                    }
                    if (duplicate) {
                        // add reversed mapping to duplicates map
                        duplicateMapping.put(path, localId);
                    } else {
                        // add reversed mapping
                        String internedPath = path.intern();
                        reversedMapping.put(internedPath, localId);
                        paths.add(internedPath);
                    }
                    count++;
                }
                fileCounts.put(localId, Integer.valueOf(count));
            }
        }
    }

    /**
     * Locates the mapping file: first relative to the current directory, then using the name as given.
     *
     * @return path of an existing mapping file
     * @throws IOException when no existing file could be determined
     */
    private Path resolveMappingPath() throws IOException {
        Path mappingPath;
        try {
            // try to treat the mapping file as local file
            mappingPath = FileSystems.getDefault().getPath(".", mappingFile);
            if (!mappingPath.toFile().exists()) {
                mappingPath = FileSystems.getDefault().getPath(mappingFile);
            }
        } catch (InvalidPathException e) {
            // in case path cannot be created try to treat the mapping file as absolute path
            mappingPath = FileSystems.getDefault().getPath(mappingFile);
            logger.info("Invalid Path exception. Mapping file " + mappingFile + " as absolute path: " + mappingPath);
        }
        if (mappingPath == null || !mappingPath.toFile().exists()) {
            throw new IOException("Mapping file cannot be found: " + mappingFile);
        }
        return mappingPath;
    }

    /**
     * Returns the name of the path segment that directly follows the first occurrence of {@link #IMAGE_DIR}.
     *
     * @param path resource path
     * @return the segment following the image directory, or null when the path does not contain {@link #IMAGE_DIR}
     */
    @Override
    public String getResourceProviderId(String path) {
        int pos = path.indexOf(IMAGE_DIR);
        if (pos == -1) {
            logger.error("No image directory found in resource path.");
            return null;
        }
        String rest = path.substring(pos + IMAGE_DIR.length());
        // drop the separator that follows the image directory
        if (rest.startsWith(ResourceMigrator.LINUX_SEPARATOR) || rest.startsWith(ResourceMigrator.WINDOWS_SEPARATOR)) {
            rest = rest.substring(1);
        }
        // the identifier ends at the next separator (linux one is preferred) or at the end of the path
        int end = rest.indexOf(ResourceMigrator.LINUX_SEPARATOR);
        if (end == -1) {
            end = rest.indexOf(ResourceMigrator.WINDOWS_SEPARATOR);
        }
        return end > -1 ? rest.substring(0, end) : rest;
    }

    /**
     * Returns the configured data provider identifier.
     *
     * @param path ignored, the configured data provider is always used
     * @return data provider identifier given in the constructor
     */
    @Override
    public String getDataProviderId(String path) {
        // path is not used to determine data provider, always use the configured data provider
        return dataProviderId;
    }

    /**
     * Returns data provider properties. They are read from the file {@code <id>.properties} when {@code path}
     * is that file or a directory containing it; otherwise default properties are returned.
     *
     * @param path path of a properties file or of a directory
     * @return data provider properties
     * @throws IllegalArgumentException when the data provider identifier is not set
     */
    @Override
    public DataProviderProperties getDataProviderProperties(String path) {
        String id = getDataProviderId(path);
        if (id == null) {
            // something is wrong, data provider not specified, this should never happen
            throw new IllegalArgumentException("Data provider identifier must be specified for Europeana Newspapers migration!");
        }
        String propertiesName = id + PROPERTIES_EXTENSION;
        File f = new File(path);
        // when file is id.properties return properties from file
        if (f.exists() && f.isFile() && f.getName().equals(propertiesName)) {
            return getDataProviderPropertiesFromFile(f);
        }
        // when file is directory try to search for file id.properties inside
        if (f.isDirectory()) {
            File dpFile = new File(f, propertiesName);
            if (dpFile.exists()) {
                return getDataProviderPropertiesFromFile(dpFile);
            }
        }
        return getDefaultDataProviderProperties();
    }

    /**
     * Looks up the local identifier of a file in the data read from the mapping file.
     *
     * @param location  location the path belongs to
     * @param path      path of the file
     * @param duplicate when true the duplicate mapping is searched, otherwise the reversed mapping
     * @return local identifier or null when the file is not present in the searched mapping
     */
    @Override
    public String getLocalIdentifier(String location, String path, boolean duplicate) {
        // first get the local path within location
        String localPath = getLocalPath(location, path);
        // we have to find the identifier in the mapping file
        Map<String, String> mapping = duplicate ? duplicateMapping : reversedMapping;
        String localId = mapping.get(localPath);
        // when searching in normal mapping and id is not found display a warning
        if (localId == null && !duplicate) {
            logger.warn("Local identifier for file " + localPath + " was not found in the mapping file!");
        }
        return localId;
    }

    /**
     * Strips everything up to and including the location (and the one character following it) from the path.
     *
     * @param location location to strip
     * @param path     path of the file
     * @return the part of the path after the location; the unchanged path when it does not contain the location;
     * an empty string when nothing follows the location
     */
    private String getLocalPath(String location, String path) {
        int i = path.indexOf(location);
        if (i == -1) {
            return path;
        }
        // + 1 skips the separator following the location
        int start = i + location.length() + 1;
        if (start >= path.length()) {
            // path ends at the location, substring would be out of range
            return "";
        }
        return path.substring(start);
    }

    /**
     * Returns the number of file paths listed in the mapping file for the given local identifier.
     *
     * @param localId local identifier
     * @return number of paths or -1 when the identifier was not present in the mapping file
     */
    @Override
    public int getFileCount(String localId) {
        Integer count = fileCounts.get(localId);
        return count == null ? -1 : count.intValue();
    }

    /**
     * Splits every given {@link FilePaths} object by newspaper title and year.
     *
     * @param paths objects to split
     * @return list of all objects resulting from the split
     */
    @Override
    public List<FilePaths> split(List<FilePaths> paths) {
        List<FilePaths> result = new ArrayList<FilePaths>();
        for (FilePaths fp : paths) {
            result.addAll(split(fp, true));
        }
        return result;
    }

    /**
     * Splits the paths of one {@link FilePaths} object into groups, one per newspaper title
     * (the directory just inside the provider directory), optionally extended by the year directory.
     * <p>
     * The input object is returned unchanged when any path has an unexpected structure or when all
     * paths fall into a single group.
     *
     * @param fp   object to split
     * @param year when true the directory following the title is added to the grouping key
     * @return list of resulting objects
     */
    private List<FilePaths> split(FilePaths fp, boolean year) {
        List<FilePaths> result = new ArrayList<FilePaths>();
        Map<String, List<String>> titlePaths = new HashMap<String, List<String>>();

        BufferedReader pathsReader = fp.getPathsReader();
        try {
            String path;
            while ((path = pathsReader.readLine()) != null) {
                String key = getGroupingKey(fp, path, year);
                if (key == null) {
                    // strange path, return the FilePaths object unchanged regardless the other paths
                    result.add(fp);
                    return result;
                }
                List<String> group = titlePaths.get(key);
                if (group == null) {
                    group = new ArrayList<String>();
                    titlePaths.put(key, group);
                }
                group.add(path);
            }
        } catch (IOException e) {
            logger.error("Cannot read paths file for location " + fp.getLocation() + " and provider " + fp.getDataProvider(), e);
        } finally {
            // closed explicitly so that a failing close can never alter the returned result
            if (pathsReader != null) {
                try {
                    pathsReader.close();
                } catch (IOException e) {
                    logger.warn("Cannot close paths reader for location " + fp.getLocation() + " and provider " + fp.getDataProvider(), e);
                }
            }
        }

        if (titlePaths.size() == 1) {
            // all paths belong to the same title so no need to create a new FilePaths object as it would be the same as the input one
            result.add(fp);
            return result;
        }
        // now create FilePaths object for every newspapers title
        for (Map.Entry<String, List<String>> entry : titlePaths.entrySet()) {
            FilePaths filePaths = new FilePaths(fp.getLocation(), fp.getDataProvider());
            filePaths.setIdentifier(entry.getKey().replace(ResourceMigrator.LINUX_SEPARATOR, "_"));
            filePaths.useFile(filePaths.getIdentifier());
            filePaths.addPaths(entry.getValue());
            result.add(filePaths);
        }
        return result;
    }

    /**
     * Determines the grouping key (title, optionally with following directories) of a single path.
     *
     * @param fp   object the path comes from, supplies location and data provider name
     * @param path path to analyse
     * @param year when true the key is extended by the directory following the title; when that directory
     *             name is shorter than 4 characters one more directory level is included
     * @return grouping key, or null when the path contains no provider name or no directory after it
     */
    private String getGroupingKey(FilePaths fp, String path, boolean year) {
        String separator = ResourceMigrator.LINUX_SEPARATOR;
        String location = fp.getLocation();
        String provider = fp.getDataProvider();

        // search for the provider name behind the location when the location is part of the path
        int i = path.indexOf(location);
        i = path.indexOf(provider, i == -1 ? 0 : i + location.length() + 1);
        if (i == -1) {
            // no data provider name in path
            return null;
        }
        int titleStart = i + provider.length() + 1;
        if (titleStart > path.length()) {
            // path ends with the provider name, there is no title at all
            return null;
        }
        String title = path.substring(titleStart);
        int end = title.indexOf(separator);
        if (end == -1) {
            // no directory found in path
            return null;
        }
        if (year) {
            // add year to title, for every year of a title there will be a separate thread
            int next = title.indexOf(separator, end + 1);
            // when there is no directory below the title the key stays limited to the title
            if (next != -1) {
                String yearStr = title.substring(end + 1, next);
                if (yearStr.length() < 4) {
                    // too short for a year, include one more level when it exists
                    int further = title.indexOf(separator, next + 1);
                    end = further == -1 ? next : further;
                } else {
                    end = next;
                }
            }
        }
        return title.substring(0, end);
    }

    /**
     * @return always true
     */
    @Override
    public boolean usePathsFile() {
        return true;
    }

    /**
     * @return the internal (mutable) map of file path to local identifier built from the mapping file
     */
    @Override
    public Map<String, String> getReversedMapping() {
        return reversedMapping;
    }
}