/**
* ContentScraperTest
* part of YaCy
* Copyright 2016 by luccioman; https://github.com/luccioman
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.document.parser.html;
import java.awt.Dimension;
import java.io.IOException;
import java.io.StringReader;
import java.io.Writer;
import java.net.MalformedURLException;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
import java.util.Set;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.VocabularyScraper;
import net.yacy.kelondro.util.FileUtils;
import org.junit.Assert;
import org.junit.Test;
/**
* Unit tests for the ContentScraper class.
* @author luc
*
*/
public class ContentScraperTest {

    /**
     * Checks that {@link ContentScraper#parseSizes(String)} returns the set of
     * distinct, valid {@code WIDTHxHEIGHT} entries found in a space separated
     * string, and an empty set when there is nothing valid to return.
     */
    @Test
    public final void testParseSizes() {
        /* Normal case */
        Set<Dimension> sizes = ContentScraper.parseSizes("96x128");
        Assert.assertEquals(1, sizes.size());
        Assert.assertTrue(sizes.contains(new Dimension(96, 128)));

        /* "any" keyword : no concrete dimension is expected */
        sizes = ContentScraper.parseSizes("any");
        Assert.assertEquals(0, sizes.size());

        /* Multiple valid sizes, lower and upper case separator */
        sizes = ContentScraper.parseSizes("96x128 16X16 1X2 1024x768");
        Assert.assertEquals(4, sizes.size());
        Assert.assertTrue(sizes.contains(new Dimension(96, 128)));
        Assert.assertTrue(sizes.contains(new Dimension(16, 16)));
        Assert.assertTrue(sizes.contains(new Dimension(1, 2)));
        Assert.assertTrue(sizes.contains(new Dimension(1024, 768)));

        /* Duplicate entries : the separator case must not produce distinct entries */
        sizes = ContentScraper.parseSizes("96x128 96X128 1X2 96x128");
        Assert.assertEquals(2, sizes.size());
        Assert.assertTrue(sizes.contains(new Dimension(96, 128)));
        Assert.assertTrue(sizes.contains(new Dimension(1, 2)));

        /* Multiple inner and trailing spaces */
        sizes = ContentScraper.parseSizes(" 96x128 16X16 ");
        Assert.assertEquals(2, sizes.size());
        Assert.assertTrue(sizes.contains(new Dimension(96, 128)));
        Assert.assertTrue(sizes.contains(new Dimension(16, 16)));

        /* Empty string */
        sizes = ContentScraper.parseSizes("");
        Assert.assertEquals(0, sizes.size());

        /* null string */
        sizes = ContentScraper.parseSizes(null);
        Assert.assertEquals(0, sizes.size());

        /*
         * Invalid sizes : leading zeros, negative values, zero dimensions,
         * missing width or height, non numeric values, missing separator
         */
        sizes = ContentScraper.parseSizes("096x0128 -16x-16 0x0 x768 78x axb 1242");
        Assert.assertEquals(0, sizes.size());

        /* Mix of valid and invalid sizes : only the valid ones are kept */
        sizes = ContentScraper.parseSizes("96x128 16X16 axb 123 78x32");
        Assert.assertEquals(3, sizes.size());
        Assert.assertTrue(sizes.contains(new Dimension(96, 128)));
        Assert.assertTrue(sizes.contains(new Dimension(16, 16)));
        Assert.assertTrue(sizes.contains(new Dimension(78, 32)));
    }

    /**
     * Checks that {@link ContentScraper#parseSpaceSeparatedTokens(String)}
     * returns the set of distinct tokens of a space separated string, keeping
     * the case of each token, and an empty set for an empty or null input.
     */
    @Test
    public final void testParseSpaceSeparatedTokens() {
        /* Normal case */
        Set<String> tokens = ContentScraper.parseSpaceSeparatedTokens("abc de");
        Assert.assertEquals(2, tokens.size());
        Assert.assertTrue(tokens.contains("abc"));
        Assert.assertTrue(tokens.contains("de"));

        /* One item only */
        tokens = ContentScraper.parseSpaceSeparatedTokens("abc");
        Assert.assertEquals(1, tokens.size());
        Assert.assertTrue(tokens.contains("abc"));

        /* Multiple inner and trailing spaces */
        tokens = ContentScraper.parseSpaceSeparatedTokens(" abc d efff fgj ");
        Assert.assertEquals(4, tokens.size());
        Assert.assertTrue(tokens.contains("abc"));
        Assert.assertTrue(tokens.contains("d"));
        Assert.assertTrue(tokens.contains("efff"));
        Assert.assertTrue(tokens.contains("fgj"));

        /* Duplicate entries */
        tokens = ContentScraper.parseSpaceSeparatedTokens("abc bb abc abc ABC");
        Assert.assertEquals(3, tokens.size());
        Assert.assertTrue(tokens.contains("abc"));
        /* ignoring case is not the purpose of this function */
        Assert.assertTrue(tokens.contains("ABC"));
        Assert.assertTrue(tokens.contains("bb"));

        /* Empty string */
        tokens = ContentScraper.parseSpaceSeparatedTokens("");
        Assert.assertEquals(0, tokens.size());

        /* Null string */
        tokens = ContentScraper.parseSpaceSeparatedTokens(null);
        Assert.assertEquals(0, tokens.size());
    }

    /**
     * Checks that the date of a html5 {@code <time datetime='...'>} tag is
     * reported by {@link ContentScraper#getStartDates()} once the page has been
     * fed to the scraper through a {@link TransformerWriter}.
     *
     * @throws MalformedURLException when the test URL is rejected
     * @throws IOException when the page can not be written to the scraper
     */
    @Test
    public void testGetStartDates() throws MalformedURLException, IOException {
        final DigestURL root = new DigestURL("http://test.org/test.html");
        final String page = "<html><body>"
                + "<time datetime='2016-12-23'>23. Dezember 2016</time>" // html5 time tag
                + "</body></html>";

        final ContentScraper scraper = new ContentScraper(root, 10, new VocabularyScraper(), 0);
        try {
            final Writer writer = new TransformerWriter(null, null, scraper, null, false);
            try {
                FileUtils.copy(new StringReader(page), writer);
            } finally {
                /* always release the writer, even when the copy fails */
                writer.close();
            }

            final List<Date> dateResultList = scraper.getStartDates();

            /*
             * Start from the epoch and only overwrite year, month and day : the
             * time of day fields keep the values they have at the epoch in the
             * JVM default time zone.
             * NOTE(review): the expected instant therefore depends on the default
             * time zone - confirm it matches what the scraper produces on
             * machines whose zone offset changed between 1970 and 2016.
             */
            final Calendar cal = Calendar.getInstance();
            cal.setTimeInMillis(0);
            cal.set(2016, Calendar.DECEMBER, 23);
            final Date expected = cal.getTime();

            /* without these checks the loop below would pass on an empty result */
            Assert.assertNotNull("start dates list must not be null", dateResultList);
            Assert.assertFalse("the date of the time tag should have been extracted",
                    dateResultList.isEmpty());
            for (final Date d : dateResultList) {
                Assert.assertEquals(expected, d);
            }
        } finally {
            /* release the scraper even when an assertion fails */
            scraper.close();
        }
    }
}