package eu.dnetlib.iis.common.spark.avro;
import static org.junit.Assert.assertEquals;
import java.io.File;
import java.io.IOException;
import java.util.List;
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.io.Files;
import eu.dnetlib.iis.common.IntegrationTest;
import eu.dnetlib.iis.common.avro.Country;
import eu.dnetlib.iis.common.utils.AvroTestUtils;
import pl.edu.icm.sparkutils.test.SparkJob;
import pl.edu.icm.sparkutils.test.SparkJobBuilder;
import pl.edu.icm.sparkutils.test.SparkJobExecutor;
/**
* @author Łukasz Dumiszewski
*/
@Category(IntegrationTest.class)
public class AvroSaverTest {

    private static final Logger log = LoggerFactory.getLogger(AvroSaverTest.class);

    /** JSON fixture that {@link #main(String[])} loads as the input data frame. */
    private static final String COUNTRIES_JSON_PATH =
            "src/test/resources/eu/dnetlib/iis/common/avro/countries.json";

    // Both fields are static because the static main(...) below reads outputDirPath,
    // while the values themselves are assigned per test in before().
    // NOTE(review): this presumes the executor runs main(...) in this JVM - confirm.
    private static File workingDir;

    private static String outputDirPath;

    private final SparkJobExecutor executor = new SparkJobExecutor();


    /**
     * Creates a fresh temporary working directory and derives the avro output path from it.
     */
    @Before
    public void before() {
        File tempDir = Files.createTempDir();
        workingDir = tempDir;
        outputDirPath = tempDir.getPath() + "/spark_sql_avro_cloner/output";
    }

    /**
     * Removes the working directory created in {@link #before()}.
     *
     * @throws IOException if the directory cannot be deleted
     */
    @After
    public void after() throws IOException {
        FileUtils.deleteDirectory(workingDir);
    }


    //------------------------ TESTS --------------------------

    /**
     * Runs {@link #main(String[])} as a spark job and checks the avro records
     * read back from the output directory.
     *
     * @throws IOException if the saved avro data store cannot be read
     */
    @Test
    public void test() throws IOException {

        // given
        SparkJob avroSaverJob = createAvroSaverJob();

        // execute
        executor.execute(avroSaverJob);

        // assert
        List<Country> savedCountries = AvroTestUtils.readLocalAvroDataStore(outputDirPath);
        log.info(savedCountries.toString());

        assertEquals(4, savedCountries.size());

        long polishEntries = savedCountries.stream()
                .filter(country -> country.getIso().equals("PL"))
                .count();
        assertEquals(1, polishEntries);
    }


    //------------------------ LOGIC --------------------------

    /**
     * Spark job entry point: loads the countries JSON fixture into a data frame
     * and saves it with {@link AvroSaver} under the current output directory.
     *
     * @param args not used
     * @throws IOException declared for callers; see the invoked helpers
     */
    public static void main(String[] args) throws IOException {

        SparkConf sparkConf = new SparkConf()
                .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
                .set("spark.driver.host", "localhost");

        try (JavaSparkContext sparkContext = new JavaSparkContext(sparkConf)) {

            SQLContext sqlContext = new SQLContext(sparkContext);
            DataFrame rawCountries = sqlContext.read().json(COUNTRIES_JSON_PATH);

            // The explicit select fixes the column order. Without it there is no guarantee
            // as to the field order, and the fields could be saved not in accordance
            // with the avro schema.
            rawCountries.registerTempTable("countries");
            DataFrame orderedCountries = sqlContext.sql("select id, name, iso from countries");

            log.info(orderedCountries.javaRDD().collect().toString());

            AvroSaver.save(orderedCountries, Country.SCHEMA$, outputDirPath);
        }
    }


    //------------------------ PRIVATE --------------------------

    /** Builds the spark job definition that launches this class' {@link #main(String[])}. */
    private static SparkJob createAvroSaverJob() {
        return SparkJobBuilder.create()
                .setAppName("Spark Avro Saver Test")
                .addJobProperty("spark.driver.host", "localhost")
                .setMainClass(AvroSaverTest.class)
                .build();
    }
}