package org.baderlab.csplugins.enrichmentmap.resolver;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.EnumMap;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.function.BinaryOperator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Stream;

import org.apache.commons.lang3.StringUtils;
import org.baderlab.csplugins.enrichmentmap.model.DataSetFiles;
import org.baderlab.csplugins.enrichmentmap.model.EMDataSet.Method;

import com.google.common.collect.ImmutableList;

/**
 * Heuristics for discovering data sets in a folder: each entry of the folder is classified
 * (enrichment / expression / ranks / GSEA results folder / ignore) and enrichment files are
 * then paired with the expression and rank files whose names resemble theirs the most.
 */
public class DataSetResolver {

    /** File-name hint for expression files (matched case-insensitively against the whole name). */
    private static final Pattern EXPRESSION_FILE_NAME =
            Pattern.compile(".*expr(ession)?.*", Pattern.CASE_INSENSITIVE);

    /** File-name hint for rank files (matched case-insensitively against the whole name). */
    private static final Pattern RANKS_FILE_NAME =
            Pattern.compile(".*rank.*", Pattern.CASE_INSENSITIVE);

    /** The kinds of file system entries the resolver can tell apart. */
    public static enum Type {
        ENRICHMENT_BINGO,
        ENRICHMENT_DAVID,
        ENRICHMENT_GENERIC,
        ENRICHMENT_GREAT,
        ENRICHMENT_GSEA,
        GSEA_FOLDER,
        EXPRESSION,
        RANKS,
        IGNORE;

        /** @return {@code true} for every constant whose name starts with {@code ENRICHMENT}. */
        public boolean isEnrichmentFile() {
            return this.name().startsWith("ENRICHMENT");
        }
    }

    /**
     * Scans {@code rootFolder} (non-recursively) and guesses the data sets it contains.
     *
     * @param rootFolder   folder to scan; if it is itself a GSEA results folder it is returned
     *                     as the single data set
     * @param cancelStatus polled between steps; may be {@code null}, in which case the scan is
     *                     never cancelled
     * @return the guessed data sets, or an empty list if the scan was cancelled or the folder
     *         could not be read
     */
    public static List<DataSetParameters> guessDataSets(Path rootFolder, CancelStatus cancelStatus) {
        if (cancelStatus == null) {
            cancelStatus = new CancelStatus() {
                @Override
                public boolean isCancelled() {
                    return false;
                }

                @Override
                public void cancel() {
                }
            };
        }

        // First test if rootFolder is itself a GSEA results folder
        Optional<DataSetParameters> dataset = GSEAResolver.resolveGSEAResultsFolder(rootFolder);
        if (dataset.isPresent()) {
            return ImmutableList.of(dataset.get());
        }

        if (cancelStatus.isCancelled()) {
            return Collections.emptyList();
        }

        try (Stream<Path> contents = Files.list(rootFolder)) {
            Map<Type,List<Path>> types = new EnumMap<>(Type.class);
            for (Type type : Type.values()) {
                types.put(type, new ArrayList<>());
            }

            for (Path path : (Iterable<Path>) contents::iterator) {
                if (cancelStatus.isCancelled()) {
                    return Collections.emptyList();
                }
                types.get(guessType(path)).add(path);
            }

            if (cancelStatus.isCancelled()) {
                return Collections.emptyList();
            }
            return createDataSets(types);
        } catch (IOException | UncheckedIOException e) {
            // Files.list reports I/O failures that happen while iterating the directory as
            // UncheckedIOException, so both flavours have to be handled here.
            e.printStackTrace();
            return Collections.emptyList();
        }
    }

    /**
     * Turns the classified paths into data sets: every GSEA results folder that resolves becomes
     * a data set, and every generic enrichment file becomes a data set paired with its closest
     * expression and rank file (each expression/rank file is used at most once).
     */
    private static List<DataSetParameters> createDataSets(Map<Type,List<Path>> types) {
        List<DataSetParameters> dataSets = new ArrayList<>();

        // All GSEA results are fine
        for (Path gseaFolder : types.get(Type.GSEA_FOLDER)) {
            GSEAResolver.resolveGSEAResultsFolder(gseaFolder).ifPresent(dataSets::add);
        }

        // Now, iterate over Enrichments, and try to pair up with Ranks and Expressions
        // MKTODO add other enrichment types
        // Work on copies so that already-paired files can be removed from the candidate pool.
        List<Path> expressionFiles = new ArrayList<>(types.get(Type.EXPRESSION));
        List<Path> rankFiles = new ArrayList<>(types.get(Type.RANKS));

        // MKTODO what about other enrichment types?
        for (Path enrichment : types.get(Type.ENRICHMENT_GENERIC)) {
            DataSetFiles files = new DataSetFiles();
            files.setEnrichmentFileName1(enrichment.toAbsolutePath().toString());

            Optional<Path> closestExpression = findClosestMatch(enrichment, expressionFiles);
            Optional<Path> closestRanks = findClosestMatch(enrichment, rankFiles);

            closestExpression.ifPresent(path -> {
                expressionFiles.remove(path);
                files.setExpressionFileName(path.toAbsolutePath().toString());
            });
            closestRanks.ifPresent(path -> {
                rankFiles.remove(path);
                files.setRankedFile(path.toAbsolutePath().toString());
            });

            String name = getDatasetNameGeneric(enrichment.getFileName());
            dataSets.add(new DataSetParameters(name, Method.Generic, files));
        }

        return dataSets;
    }

    /**
     * Picks the candidate whose file name is the most similar to the file name of {@code p}.
     *
     * @return the best candidate, or empty if there are no candidates or the best one has a
     *         similarity score of zero
     */
    private static Optional<Path> findClosestMatch(Path p, List<Path> candidates) {
        String pf = p.getFileName().toString();

        Map<Path,Integer> scores = new HashMap<>();
        for (Path candidate : candidates) {
            String filename = candidate.getFileName().toString();
            // File names are not natural-language text: use a fixed locale so that the
            // case folding done by the fuzzy matcher does not depend on the user's settings.
            scores.put(candidate, StringUtils.getFuzzyDistance(pf, filename, Locale.ROOT));
        }

        // Despite its name the fuzzy "distance" is a similarity score: higher means closer,
        // so the closest match is the candidate with the maximum score.
        Optional<Path> closest =
                candidates.stream().reduce(BinaryOperator.maxBy(Comparator.comparing(scores::get)));

        // There should be a threshold for considering the path a match
        // MKTODO can this heuristic be improved?
        return closest.filter(path -> scores.get(path) != 0);
    }

    /**
     * Classifies a single file system entry.
     *
     * @return {@link Type#IGNORE} for hidden entries, for directories that are not GSEA results
     *         folders and for files without a tab-separated first data line; otherwise the best
     *         guess for the file
     */
    public static Type guessType(Path path) {
        try {
            if (Files.isHidden(path)) {
                return Type.IGNORE;
            }
        } catch (IOException e) {
            e.printStackTrace();
            return Type.IGNORE;
        }

        if (Files.isDirectory(path)) {
            return GSEAResolver.isGSEAResultsFolder(path) ? Type.GSEA_FOLDER : Type.IGNORE;
        }

        Optional<String> firstLine = getFirstDataLine(path);
        if (!firstLine.isPresent() || !isTabSeparated(firstLine.get())) {
            return Type.IGNORE;
        }

        return guess(path, firstLine.get());
    }

    /**
     * Scores the candidate types using the file extension, the first data line and the file
     * name, then returns the type with the highest score (with a hardcoded tiebreaker).
     */
    private static Type guess(Path path, String firstLine) {
        Map<Type,Integer> scores = new EnumMap<>(Type.class);

        // Guess based on extension and/or first line of file
        if (hasExtension(path, "gct")) {
            // GCT is the expression data format, not a rank format.
            addScore(scores, Type.EXPRESSION, 1);
        }
        if (hasExtension(path, "rnk")) {
            addScore(scores, Type.RANKS, 1);
            addScore(scores, Type.EXPRESSION, 1);
        }
        if (hasExtension(path, "xls", "bgo", "tsv", "txt")) {
            Type type = guessEnrichmentType(path);
            if (type == Type.IGNORE) {
                addScore(scores, Type.ENRICHMENT_GENERIC, 1);
                addScore(scores, Type.EXPRESSION, 1);
            } else {
                addScore(scores, type, 2); // this is a lot of evidence
            }
        }

        // Test first line
        if (!isRankLine(firstLine)) {
            addScore(scores, Type.RANKS, -1);
        }
        if (!isExpressionLine(firstLine)) {
            addScore(scores, Type.EXPRESSION, -1);
        }

        // Guess based on file name
        String fileName = path.getFileName().toString();
        if (matches(fileName, EXPRESSION_FILE_NAME)) {
            addScore(scores, Type.EXPRESSION, 3);
        }
        if (matches(fileName, RANKS_FILE_NAME)) {
            addScore(scores, Type.RANKS, 3);
        }
        // Not adding score here for enrichment files because guessEnrichmentType should be
        // enough evidence

        Set<Type> possibleTypes = typesWithHighestScore(scores);
        if (possibleTypes.isEmpty()) {
            return Type.IGNORE;
        }
        if (possibleTypes.size() == 1) {
            return possibleTypes.iterator().next();
        }

        // Here we hardcode a tiebreaker
        if (possibleTypes.contains(Type.EXPRESSION)) {
            return Type.EXPRESSION;
        }
        if (possibleTypes.contains(Type.RANKS)) {
            return Type.RANKS;
        }
        return possibleTypes.iterator().next();
    }

    /** Adds {@code add} to the score stored under {@code key}, starting from zero. */
    private static <K> void addScore(Map<K,Integer> map, K key, int add) {
        map.merge(key, add, Integer::sum);
    }

    /**
     * Returns every type that shares the highest score.
     *
     * <p>Types whose net score is negative are never returned: a negative score means the only
     * thing known about the file is that it does <em>not</em> look like that type, which must
     * not be turned into a positive classification. The result iterates in enum declaration
     * order, so callers that pick the first element get a deterministic answer.
     */
    private static Set<Type> typesWithHighestScore(Map<Type,Integer> scores) {
        Set<Type> highest = EnumSet.noneOf(Type.class);
        int highestScore = 0;

        for (Map.Entry<Type,Integer> entry : scores.entrySet()) {
            int score = entry.getValue();
            if (score < 0) {
                continue;
            }
            if (highest.isEmpty() || score > highestScore) {
                highestScore = score;
                highest.clear();
                highest.add(entry.getKey());
            } else if (score == highestScore) {
                highest.add(entry.getKey());
            }
        }
        return highest;
    }

    /** @return {@code true} if the whole of {@code s} matches {@code pattern}. */
    private static boolean matches(String s, Pattern pattern) {
        Matcher matcher = pattern.matcher(s);
        return matcher.matches();
    }

    /** @return {@code true} if {@link Double#parseDouble(String)} accepts {@code x}. */
    private static boolean isDouble(String x) {
        try {
            Double.parseDouble(x);
            return true;
        } catch (NumberFormatException e) {
            return false;
        }
    }

    /**
     * @return {@code true} if the file name ends with a dot followed by one of the given
     *         extensions, compared case-insensitively
     */
    private static boolean hasExtension(Path path, String... extensions) {
        String fileName = path.getFileName().toString();
        for (String ext : extensions) {
            String suffix = "." + ext;
            // regionMatches returns false (instead of throwing) when the name is shorter
            // than the suffix, i.e. when the offset is negative.
            int offset = fileName.length() - suffix.length();
            if (fileName.regionMatches(true, offset, suffix, 0, suffix.length())) {
                return true;
            }
        }
        return false;
    }

    /**
     * Reads the first data line of a file: lines starting with {@code #} are dropped, then the
     * next line is skipped as the header.
     *
     * @return the line after the header, or empty if there is none or the file cannot be read
     */
    private static Optional<String> getFirstDataLine(Path path) {
        try (Stream<String> lines = Files.lines(path)) {
            return lines
                    .filter(l -> !l.startsWith("#")) // filter out comment lines
                    .skip(1) // skip header line
                    .findFirst();
        } catch (IOException | UncheckedIOException e) {
            return Optional.empty();
        }
    }

    /**
     * A line looks like expression data if it has more than two tab-separated tokens and every
     * token from the third on is a number, or exactly two tokens and the second is a number.
     */
    private static boolean isExpressionLine(String line) {
        String[] tokens = line.split("\t");
        if (tokens.length > 2) {
            return Arrays.stream(tokens).skip(2).allMatch(DataSetResolver::isDouble);
        }
        return tokens.length == 2 && isDouble(tokens[1]);
    }

    /**
     * A line looks like rank data if it has exactly five tab-separated tokens and the fifth is a
     * number, or exactly two tokens and the second is a number.
     */
    private static boolean isRankLine(String line) {
        String[] tokens = line.split("\t");
        if (tokens.length == 5) {
            return isDouble(tokens[4]);
        }
        return tokens.length == 2 && isDouble(tokens[1]);
    }

    /** @return {@code true} if the line contains at least one tab character. */
    private static boolean isTabSeparated(String line) {
        return line.indexOf("\t") != -1;
    }

    /**
     * Convenience overload of {@link #guessEnrichmentType(Path)} taking the path as a string.
     */
    public static Type guessEnrichmentType(String path) {
        return guessEnrichmentType(Paths.get(path));
    }

    /**
     * Guesses the kind of enrichment file from the first line (the header) of the file.
     * This logic was moved here from {@code DetermineEnrichmentResultFileReader}.
     *
     * @return the guessed enrichment type, or {@link Type#IGNORE} if the file is empty or
     *         cannot be read
     */
    public static Type guessEnrichmentType(Path path) {
        try {
            String firstLine =
                    com.google.common.io.Files.readFirstLine(path.toFile(), Charset.defaultCharset());
            if (firstLine == null) {
                // Empty file: there is no header to inspect.
                return Type.IGNORE;
            }

            String[] tokens = firstLine.split("\t");

            // check to see if there are exactly 11 columns - = GSEA results
            if (tokens.length == 11) {
                // check to see if the ES is the 5th column and that NES is the 6th column
                if (tokens[4].equalsIgnoreCase("ES") && tokens[5].equalsIgnoreCase("NES")) {
                    return Type.ENRICHMENT_GSEA;
                }
                // it is possible that the file can have 11 columns but that it is still a
                // generic file if it doesn't specify ES and NES in the 5 and 6th columns
                return Type.ENRICHMENT_GENERIC;
            }

            // check to see if there are exactly 13 columns - = DAVID results
            if (tokens.length == 13) {
                // check to see that the 6th column is called Genes and that the 12th column
                // is called "Benjamini"
                if (tokens[5].equalsIgnoreCase("Genes") && tokens[11].equalsIgnoreCase("Benjamini")) {
                    return Type.ENRICHMENT_DAVID;
                }
                return Type.ENRICHMENT_GENERIC;
            }

            // fix bug with new version of bingo plugin change the case of the header file.
            // Locale.ROOT keeps the case-insensitive comparison independent of user settings.
            if (firstLine.toLowerCase(Locale.ROOT)
                    .contains("File created with BiNGO".toLowerCase(Locale.ROOT))) {
                return Type.ENRICHMENT_BINGO;
            }
            if (firstLine.contains("GREAT version")) {
                return Type.ENRICHMENT_GREAT;
            }
            return Type.ENRICHMENT_GENERIC;
        } catch (IOException e) {
            // MKTODO log the exception
        }
        return Type.IGNORE;
    }

    /**
     * Derives a data set name from a file: the file name with its last extension removed.
     * A leading dot is not treated as an extension separator, so the name is never emptied.
     */
    public static String getDatasetNameGeneric(Path file) {
        String name = file.getFileName().toString();
        int dot = name.lastIndexOf('.');
        return dot > 0 ? name.substring(0, dot) : name;
    }
}