/* * Copyright 2011 Global Biodiversity Information Facility (GBIF) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.gbif.occurrence.util; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.io.StringReader; import java.util.zip.GZIPInputStream; import org.junit.Test; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; public class XmlSanitizingReaderTest { private String doSingleReads(String test) throws IOException { StringReader reader = new StringReader(test); XmlSanitizingReader xmlReader = new XmlSanitizingReader(reader); StringBuilder sb = new StringBuilder(); while (xmlReader.ready()) { int nextIntChar = xmlReader.read(); char nextChar = (char) nextIntChar; if (nextIntChar != -1) sb.append(nextChar); } String result = sb.toString(); // System.out.println("Single reads result [" + result + "]"); return result; } private String doSimpleBufferRead(String test) throws IOException { StringReader reader = new StringReader(test); XmlSanitizingReader xmlReader = new XmlSanitizingReader(reader); StringBuilder sb = new StringBuilder(); while (xmlReader.ready()) { char[] buffer = new char[12]; xmlReader.read(buffer); sb.append(new String(buffer)); } String result = sb.toString().trim(); // System.out.println("Simple buffer read result [" + result + "]"); return result; } private String doOffsetBufferRead(String test) throws IOException { StringReader reader = new StringReader(test); XmlSanitizingReader xmlReader = new XmlSanitizingReader(reader); StringBuilder sb = new StringBuilder(); int totalCharsRead = 0; while (xmlReader.ready()) { char[] buffer = new char[12]; int charsRead = xmlReader.read(buffer, 0, 12); totalCharsRead += charsRead; sb.append(new String(buffer)); } String result = sb.toString().trim(); // System.out.println("Offset buffer read result [" + result + "]"); return result; } @Test public void testAsciiSingleReads() throws Exception { String test = "No bad chars and no funny chars."; String result = doSingleReads(test); assertTrue(result.equals(test)); } @Test public void testAsciiSimpleBufferRead() throws Exception { String test = "No bad chars and no funny chars."; String result = doSimpleBufferRead(test); assertTrue(result.equals(test)); } @Test public void testAsciiOffsetBufferRead() throws Exception { String test = "No bad chars and no funny chars."; String result = doOffsetBufferRead(test); assertTrue(result.equals(test)); } @Test public void testUtf8SingleReads() throws Exception { String test = "No bad chars and some seriously funny chars: äåáàæœčéèêëïñøöüßšž北京العربية"; String result = doSingleReads(test); assertTrue(result.equals(test)); } @Test public void testUtf8SimpleBufferRead() throws Exception { String test = "No bad chars and some seriously funny chars: äåáàæœčéèêëïñøöüßšž北京العربية"; String result = doSimpleBufferRead(test); assertTrue(result.equals(test)); } @Test public void testUtf8OffsetBufferRead() throws Exception { String test = "No bad chars and some seriously funny chars: äåáàæœčéèêëïñøöüßšž北京العربية"; String result = doOffsetBufferRead(test); assertTrue(result.equals(test)); } @Test public void testBadXmlSingleReads() throws Exception { char bad1 = 0xb; char bad2 = 0x7; char goodWeird = 0xa; String test = "Some bad chars " + bad1 + goodWeird + " and some seriously funny chars: äåáàæœčéèêëïñøöüßšž北京العربية " + bad2 + "end"; String goal = "Some bad chars " + goodWeird + " and some seriously funny chars: äåáàæœčéèêëïñøöüßšž北京العربية end"; String result = doSingleReads(test); assertTrue(result.equals(goal)); } @Test public void testBadXmlSimpleBufferRead() throws Exception { char bad1 = 0xb; char bad2 = 0x7; char goodWeird = 0xa; String test = "Some bad chars " + bad1 + goodWeird + " and some seriously funny chars: äåáàæœčéèêëïñøöüßšž北京العربية " + bad2 + "end"; String goal = "Some bad chars " + goodWeird + " and some seriously funny chars: äåáàæœčéèêëïñøöüßšž北京العربية end"; String result = doSimpleBufferRead(test); assertTrue(result.equals(goal)); } @Test public void testBadXmlOffsetBufferRead() throws Exception { char bad1 = 0xb; char bad2 = 0x7; char goodWeird = 0xa; String test = "Some bad chars " + bad1 + goodWeird + " and some seriously funny chars: äåáàæœčéèêëïñøöüßšž北京العربية " + bad2 + "end"; String goal = "Some bad chars " + goodWeird + " and some seriously funny chars: äåáàæœčéèêëïñøöüßšž北京العربية end"; String result = doOffsetBufferRead(test); assertTrue(result.equals(goal)); } @Test public void testBadXmlFileRead() throws Exception { String fileName = getClass().getResource("/responses/problematic/spanish_bad_xml.gz").getFile(); File file = new File(fileName); FileInputStream fis = new FileInputStream(file); GZIPInputStream inputStream = new GZIPInputStream(fis); StringBuilder sb = new StringBuilder(); XmlSanitizingReader xmlReader = new XmlSanitizingReader(new InputStreamReader(inputStream, "UTF-8")); while (xmlReader.ready()) { char[] buff = new char[8192]; xmlReader.read(buff, 0, 8192); sb.append(buff); } assertEquals(6210, sb.toString().trim().length()); } @Test public void testBadXmlFileReadWithBufferedReaderCharArray() throws Exception { String fileName = getClass().getResource("/responses/problematic/spanish_bad_xml.gz").getFile(); File file = new File(fileName); FileInputStream fis = new FileInputStream(file); GZIPInputStream inputStream = new GZIPInputStream(fis); StringBuilder sb = new StringBuilder(); BufferedReader xmlReader = new BufferedReader(new XmlSanitizingReader(new InputStreamReader(inputStream, "UTF-8"))); while (xmlReader.ready()) { char[] buff = new char[8192]; xmlReader.read(buff, 0, 8192); sb.append(buff); } assertEquals(6210, sb.toString().trim().length()); } @Test public void testBadXmlFileReadWithBufferedReaderReadLines() throws Exception { String fileName = getClass().getResource("/responses/problematic/spanish_bad_xml.gz").getFile(); File file = new File(fileName); FileInputStream fis = new FileInputStream(file); GZIPInputStream inputStream = new GZIPInputStream(fis); StringBuilder sb = new StringBuilder(); BufferedReader buffReader = new BufferedReader(new XmlSanitizingReader(new InputStreamReader(inputStream, "UTF-8"))); while (buffReader.ready()) { String line = buffReader.readLine(); // System.out.println("adding line [" + line + "]"); sb.append(line); } // System.out.println("\nFinal string output is:\n" + sb.toString()); // drops newline chars vs chararray test, above assertEquals(6097, sb.toString().trim().length()); } }