package io.github.infolis.util;
import java.io.IOException;
import java.util.StringTokenizer;
import java.util.regex.Pattern;
/**
* Cleans messy text files (text automatically extracted from pdf documents).
*
* @author katarina.boland@gesis.org
* @author farag.saad@gesis.org {@link TextCleaningUtils#removeLineBreaks(String)}
* @version 2014-01-29
*/
public class TextCleaningUtils {

    /**
     * Matches every character of the Unicode categories control (Cc), format (Cf),
     * private use (Co), surrogate (Cs) and unassigned (Cn) that is not whitespace.
     * <p>
     * The class is written as an intersection on purpose: negating a union of the
     * complemented categories would yield an empty class (the categories are
     * disjoint, so their complements together cover every character) and the
     * pattern would never match anything.
     */
    private static final Pattern CONTROL_SEQUENCES =
            Pattern.compile("[\\p{Cc}\\p{Cf}\\p{Co}\\p{Cs}\\p{Cn}&&[^\\s]]");

    /** Matches runs of whitespace; used to glue the two halves of a broken word together. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /** The "not sign" (U+00AC) that some extractors emit where a word was split at a line end. */
    private static final char LINE_CONTINUATION_MARK = '\u00AC';

    /** Token delimiter used when scanning the text for broken words. */
    private static final String TOKEN_DELIMITER = " ";

    /**
     * Removes control sequences that may have been inserted during automatic text extraction.
     * Whitespace characters (space, tab, line feed, vertical tab, form feed, carriage return)
     * are kept even though most of them are control characters.
     *
     * @param text the text to clean; must not be {@code null}
     * @return the text without non-whitespace control, format, private-use, surrogate
     *         and unassigned characters
     * @throws NullPointerException if {@code text} is {@code null}
     */
    public static String removeControlSequences(String text) {
        return CONTROL_SEQUENCES.matcher(text).replaceAll("");
    }

    /**
     * Resolves hyphenation and re-assembles words at line breaks.
     * Tries to not concatenate words connected by "-" coincidentally occurring at a line break.
     * <p>
     * The text is split at single spaces. Each token is handled as follows:
     * <ul>
     * <li>a token containing the not sign (U+00AC) has all of these marks and all of its
     *     whitespace removed;</li>
     * <li>a token that contains a hyphen somewhere before the platform line separator and
     *     does not start with a digit has its <em>last</em> hyphen and all of its whitespace
     *     removed;</li>
     * <li>every other token is copied unchanged.</li>
     * </ul>
     * Re-assembled words are followed by a space and the platform line separator, unchanged
     * tokens by a single space, so a non-empty result always ends with trailing whitespace.
     * <p>
     * NOTE(review): the position check uses the first hyphen while the removal uses the last
     * one, so a token such as {@code a-b<newline>c-d} becomes {@code a-bcd}. This mirrors the
     * long-standing behaviour and is deliberately left untouched here.
     *
     * @param content input text to be processed; must not be {@code null}
     * @return input text without hyphenation at line-breaks
     * @throws IOException never thrown by this implementation; declared only to stay
     *         source-compatible with existing callers
     * @throws NullPointerException if {@code content} is {@code null}
     */
    public static String removeLineBreaks(String content) throws IOException {
        final String lineSeparator = System.getProperty("line.separator");
        final StringBuilder cleaned = new StringBuilder(content.length());
        final StringTokenizer tokens = new StringTokenizer(content, TOKEN_DELIMITER);

        while (tokens.hasMoreTokens()) {
            final String token = tokens.nextToken();

            if (token.indexOf(LINE_CONTINUATION_MARK) >= 0) {
                // Explicit continuation mark: drop every mark and join the word parts.
                final String unmarked = token.replace(LINE_CONTINUATION_MARK, ' ');
                appendRejoinedWord(cleaned, unmarked, lineSeparator);
            } else if (isHyphenatedAcrossLineBreak(token, lineSeparator)) {
                // Only the last hyphen is treated as the hyphenation mark, so that
                // earlier hyphens of a compound ("state-of-the-art") survive.
                final int lastHyphen = token.lastIndexOf('-');
                final String unhyphenated =
                        token.substring(0, lastHyphen) + ' ' + token.substring(lastHyphen + 1);
                appendRejoinedWord(cleaned, unhyphenated, lineSeparator);
            } else {
                // No broken word: keep the token as it is.
                cleaned.append(token).append(' ');
            }
        }
        return cleaned.toString();
    }

    /**
     * Tells whether a token looks like a word that was hyphenated at a line break: its first
     * hyphen precedes the first line separator and it does not start with a digit (which
     * would suggest a numeric range such as a span of years).
     */
    private static boolean isHyphenatedAcrossLineBreak(String token, String lineSeparator) {
        final int firstHyphen = token.indexOf('-');
        if (firstHyphen < 0) {
            return false;
        }
        // indexOf yields -1 when there is no line separator, which can never exceed firstHyphen.
        final int lineBreak = token.indexOf(lineSeparator);
        return lineBreak > firstHyphen && !Character.isDigit(token.charAt(0));
    }

    /**
     * Trims the given word, strips all whitespace inside it (including the line break that
     * split it) and appends it to the target, followed by a space and a line separator.
     */
    private static void appendRejoinedWord(StringBuilder target, String brokenWord,
            String lineSeparator) {
        final String joined = WHITESPACE.matcher(brokenWord.trim()).replaceAll("");
        target.append(joined).append(' ').append(lineSeparator);
    }
}