package org.wikibrain.mapper.algorithms.conceptualign3;
import edu.uci.ics.jung.algorithms.cluster.WeakComponentClusterer;
import edu.uci.ics.jung.graph.DirectedSparseGraph;
import org.wikibrain.core.WikiBrainException;
import org.wikibrain.core.dao.DaoException;
import org.wikibrain.core.dao.LocalPageDao;
import org.wikibrain.core.lang.LocalId;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Splits a collection of interlanguage links (ILLs) into clusters of pages.
 *
 * The links are filtered by per-language vote limits, the surviving links are
 * placed in a directed graph, and the weakly connected components of that graph
 * are returned as the clusters.
 *
 * Created by bjhecht on 5/21/14.
 */
public class ILLSplitter {

    private static final Logger LOG = LoggerFactory.getLogger(ILLSplitter.class);

    /**
     * Clusters the pages referenced by {@code ills}.
     *
     * A link {@code source -> dest} becomes a graph edge only when all of the
     * following hold:
     * <ul>
     *   <li>{@code source} links to at most {@code maxVotesPerLang} distinct pages
     *       in the language of {@code dest};</li>
     *   <li>{@code dest} is linked to from at least {@code minVotes} distinct
     *       source languages;</li>
     *   <li>{@code dest} receives at most {@code maxVotesPerLang} links from the
     *       language of {@code source}.</li>
     * </ul>
     * Every key of {@code ills} is added as a vertex even if none of its links
     * survive; a destination only appears if it is itself a key or is the end of
     * an accepted edge.
     *
     * @param ills            outgoing links, keyed by source page
     * @param minVotes        minimum number of distinct source languages that must
     *                        link to a destination for links to it to be kept
     * @param maxVotesPerLang maximum number of links allowed per language, applied
     *                        both to a source's out-links and a destination's in-links
     * @param print           when {@code true}, logs edges removed by the
     *                        {@code minVotes} rule and the resulting clusters
     * @param lpDao           used to look up page titles; only consulted when
     *                        {@code print} is {@code true}
     * @return the weakly connected components of the filtered link graph
     * @throws WikiBrainException if a title lookup fails while logging a removed edge
     */
    public static Set<Set<LocalId>> split(Map<LocalId, List<LocalId>> ills,
            int minVotes, int maxVotesPerLang, boolean print, LocalPageDao lpDao) throws WikiBrainException {

        // destination -> (source language id -> number of incoming links)
        Map<LocalId, SummingHashMap<Integer>> counter = new HashMap<LocalId, SummingHashMap<Integer>>();
        // source -> (destination language id -> number of distinct outgoing links)
        Map<LocalId, SummingHashMap<Integer>> outCounter = new HashMap<LocalId, SummingHashMap<Integer>>();

        for (Map.Entry<LocalId, List<LocalId>> entry : ills.entrySet()) {
            LocalId curSource = entry.getKey();
            // Language ids are always boxed through Integer.valueOf(int) so that every
            // insertion and lookup in the counters uses the same key type.
            int sourceLang = curSource.getLanguage().getId();

            SummingHashMap<Integer> outVotes = new SummingHashMap<Integer>();
            outCounter.put(curSource, outVotes);

            // Destinations already counted for this source. Tracking all of them (rather
            // than only the most recent one) keeps a repeated list entry from being
            // counted as a second link to the same language.
            Set<LocalId> seenDests = new HashSet<LocalId>();

            for (LocalId curDest : entry.getValue()) {
                int destLang = curDest.getLanguage().getId();
                if (seenDests.add(curDest)) {
                    outVotes.addValue(Integer.valueOf(destLang), 1.0);
                }

                SummingHashMap<Integer> inVotes = counter.get(curDest);
                if (inVotes == null) {
                    inVotes = new SummingHashMap<Integer>();
                    counter.put(curDest, inVotes);
                }
                // NOTE(review): incoming votes still count repeated list entries, as the
                // original code did; confirm whether they should be de-duplicated too.
                inVotes.addValue(Integer.valueOf(sourceLang), 1.0);
            }
        }

        int edgeCounter = 0;
        DirectedSparseGraph<LocalId, Integer> graph = new DirectedSparseGraph<LocalId, Integer>();
        for (Map.Entry<LocalId, List<LocalId>> entry : ills.entrySet()) {
            LocalId curSource = entry.getKey();
            Integer sourceLang = Integer.valueOf(curSource.getLanguage().getId());
            graph.addVertex(curSource);

            for (LocalId curDest : entry.getValue()) {
                Integer destLang = Integer.valueOf(curDest.getLanguage().getId());

                // Rule 1: the source may not link to too many pages in one language.
                if (outCounter.get(curSource).get(destLang) > maxVotesPerLang) {
                    LOG.warn("Found duplicate ILLs to same lang from same article exceeding maxVotes! "
                            + "Enforcing policy not allowing this!:\t{} ---> {}", curSource, curDest);
                    continue;
                }

                // Rule 2: the destination needs links from enough distinct languages.
                int totalVotes = counter.get(curDest).keySet().size();
                if (totalVotes < minVotes) {
                    if (print) {
                        try {
                            LOG.info("Removing edge: {} --> {}",
                                    titleOf(lpDao, curSource), titleOf(lpDao, curDest));
                        } catch (DaoException e) {
                            throw new WikiBrainException(e);
                        }
                    }
                    continue;
                }

                // Rule 3: the destination may not receive too many links from one language.
                // Links failing this rule are dropped without logging.
                if (counter.get(curDest).get(sourceLang) <= maxVotesPerLang) {
                    graph.addEdge(edgeCounter++, curSource, curDest);
                }
            }
        }

        WeakComponentClusterer<LocalId, Integer> clusterer = new WeakComponentClusterer<LocalId, Integer>();
        Set<Set<LocalId>> clusters = clusterer.transform(graph);
        if (print) {
            logClusters(clusters, lpDao);
        }
        return clusters;
    }

    /** Returns the title of the page identified by {@code id} as a string. */
    private static String titleOf(LocalPageDao lpDao, LocalId id) throws DaoException {
        return String.valueOf(lpDao.getById(id).getTitle());
    }

    /** Logs the members of every cluster, the number of clusters, and the largest cluster size. */
    private static void logClusters(Set<Set<LocalId>> clusters, LocalPageDao lpDao) {
        int maxSize = 0;
        for (Set<LocalId> cluster : clusters) {
            StringBuilder sb = new StringBuilder();
            for (LocalId clusterMemb : cluster) {
                try {
                    sb.append(titleOf(lpDao, clusterMemb));
                    sb.append(",");
                } catch (DaoException e) {
                    // Best effort: a failed lookup only omits this member from the log line.
                    LOG.error("Error while getting title of LocalId: " + clusterMemb, e);
                }
            }
            LOG.info("Cluster:\t{}", sb);
            maxSize = Math.max(maxSize, cluster.size());
        }
        LOG.info("Clusters identified = {}", clusters.size());
        LOG.info("Maximum Size = {}", maxSize);
    }
}