package eu.dnetlib.iis.wf.affmatching.bucket;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import eu.dnetlib.iis.wf.affmatching.model.AffMatchAffiliation;
import eu.dnetlib.iis.wf.affmatching.model.AffMatchOrganization;
import scala.Tuple2;
/**
* Implementation of {@link AffOrgJoiner} that links {@link AffMatchAffiliation} to {@link AffMatchOrganization} according to
* hashes generated by {@link #setAffiliationBucketHasher(BucketHasher)} and {@link #setOrganizationBucketHasher(BucketHasher)}.
*
* @author Łukasz Dumiszewski
*/
public class AffOrgHashBucketJoiner implements AffOrgJoiner {

    private static final long serialVersionUID = 1L;

    /** Produces the bucket hash of an affiliation; defaults to {@link AffiliationOrgNameBucketHasher}. */
    private BucketHasher<AffMatchAffiliation> affiliationBucketHasher = new AffiliationOrgNameBucketHasher();

    /** Produces the bucket hash of an organization; defaults to {@link OrganizationNameBucketHasher}. */
    private BucketHasher<AffMatchOrganization> organizationBucketHasher = new OrganizationNameBucketHasher();


    //------------------------ LOGIC --------------------------

    /**
     * Pairs affiliations with organizations that share the same bucket hash.
     * <p>
     * Each affiliation is keyed by the hash computed by the hasher set through
     * {@link #setAffiliationBucketHasher(BucketHasher)}, each organization by the hash computed by the hasher
     * set through {@link #setOrganizationBucketHasher(BucketHasher)}. The two keyed collections are then joined
     * on the hash, so only affiliation-organization pairs with equal hashes are returned; the hash itself
     * is not part of the result.
     *
     * @param affiliations affiliations to be linked
     * @param organizations organizations to be linked
     * @return pairs of affiliations and organizations that have the same hashes
     */
    @Override
    public JavaRDD<Tuple2<AffMatchAffiliation, AffMatchOrganization>> join(JavaRDD<AffMatchAffiliation> affiliations, JavaRDD<AffMatchOrganization> organizations) {

        // key both sides by their bucket hash
        JavaPairRDD<String, AffMatchAffiliation> affiliationsByHash = affiliations.mapToPair(
                affiliation -> new Tuple2<>(affiliationBucketHasher.hash(affiliation), affiliation));

        JavaPairRDD<String, AffMatchOrganization> organizationsByHash = organizations.mapToPair(
                organization -> new Tuple2<>(organizationBucketHasher.hash(organization), organization));

        // join on the hash and drop it, leaving only the matched (affiliation, organization) pairs
        return affiliationsByHash.join(organizationsByHash).values();
    }


    //------------------------ SETTERS --------------------------

    /**
     * Sets the hasher used to generate hashes for {@link AffMatchAffiliation}s.
     *
     * @param affiliationBucketHasher hasher applied to every affiliation passed to {@link #join(JavaRDD, JavaRDD)}
     */
    public void setAffiliationBucketHasher(BucketHasher<AffMatchAffiliation> affiliationBucketHasher) {
        this.affiliationBucketHasher = affiliationBucketHasher;
    }

    /**
     * Sets the hasher used to generate hashes for {@link AffMatchOrganization}s.
     *
     * @param organizationBucketHasher hasher applied to every organization passed to {@link #join(JavaRDD, JavaRDD)}
     */
    public void setOrganizationBucketHasher(BucketHasher<AffMatchOrganization> organizationBucketHasher) {
        this.organizationBucketHasher = organizationBucketHasher;
    }

}