package de.jpaw.bonaparte.batch;
import java.util.SortedSet;
import java.util.TreeSet;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.base.Joiner;
import de.jpaw.batch.api.BatchProcessor;
import de.jpaw.batch.api.BatchProcessorFactory;
import de.jpaw.batch.impl.ContributorNoop;
import de.jpaw.bonaparte.pojos.meta.AlphanumericElementaryDataItem;
import de.jpaw.bonaparte.pojos.meta.ClassDefinition;
import de.jpaw.bonaparte.pojos.meta.FieldDefinition;
/** A worker which analyzes CSV-type text files and outputs an (improved?) interface description.
* This class can now operate multithreaded, it uses kind of map/reduce to merge the results. */
public class AnalyzerWorkerFactory extends ContributorNoop implements BatchProcessorFactory<String, String> {
private static final Logger LOG = LoggerFactory.getLogger(AnalyzerWorkerFactory.class);
private final static int MAX_DIFFERENT_VALUES = 10; // if reaching this number, we are no longer interested in the specific values
private final ClassDefinition meta;
private final String separator;
private final int [] fieldLengths;
private final int [] calculateLengths() {
if (separator != null) {
return null;
} else {
int [] lengths = new int [meta.getNumberOfFields()];
for (int i = 0; i < meta.getNumberOfFields(); ++i) {
AlphanumericElementaryDataItem f = (AlphanumericElementaryDataItem)meta.getFields().get(i);
lengths[i] = f.getLength();
}
return lengths;
}
}
// separator must be non-null
public AnalyzerWorkerFactory(String separator) {
meta = null;
this.separator = separator;
fieldLengths = calculateLengths();
}
// if separator is null, meta must be a class definition with only Alphanumeric fields.
public AnalyzerWorkerFactory(ClassDefinition meta, String separator) {
this.meta = meta;
this.separator = separator;
fieldLengths = calculateLengths();
}
@Override
public BatchProcessor<String, String> getProcessor(int threadNo) {
return new AnalyzerWorker(this);
}
static private class Range {
public int minVal = 999;
public int maxVal = 0;
public void upd(int myVal) {
if (myVal > maxVal)
maxVal = myVal;
if (myVal < minVal)
minVal = myVal;
}
/** Merges a second range, which may have been computed in a parallel thread. */
public void merge(Range other) {
if (minVal > other.minVal)
minVal = other.minVal;
if (maxVal < other.maxVal)
maxVal = other.maxVal;
}
@Override
public String toString() {
return String.format("[%3d,%3d]", minVal, maxVal);
}
}
static private class Statistics {
public boolean optional = false;
public boolean nonAscii = false;
public boolean nonDigit = false;
public boolean nonUpper = false;
public boolean nonLower = false;
public Range len = new Range();
public Range dots = new Range();
public Range colons = new Range();
public Range slashes = new Range();
public Range minusses = new Range();
public int maxDigitsBeforeDot = 0;
public int maxDigitsAfterDot = 0;
public SortedSet<String> values = new TreeSet<String>();
/** Merges a second statistics entry, which may have been computed in a parallel thread. */
public void merge(Statistics other) {
this.optional = this.optional || other.optional;
this.nonAscii = this.nonAscii || other.nonAscii;
this.nonDigit = this.nonDigit || other.nonDigit;
this.nonUpper = this.nonUpper || other.nonUpper;
this.nonLower = this.nonLower || other.nonLower;
this.len.merge(other.len);
this.dots.merge(other.dots);
this.colons.merge(other.colons);
this.slashes.merge(other.slashes);
this.minusses.merge(other.minusses);
if (this.maxDigitsBeforeDot < other.maxDigitsBeforeDot)
this.maxDigitsBeforeDot = other.maxDigitsBeforeDot;
if (this.maxDigitsAfterDot < other.maxDigitsAfterDot)
this.maxDigitsAfterDot = other.maxDigitsAfterDot;
if (this.values.size() < MAX_DIFFERENT_VALUES) // we don't use it if bigger
this.values.addAll(other.values);
}
}
static private class AnalyzerWorker implements BatchProcessor<String, String> {
private final AnalyzerWorkerFactory myFactory;
private Range numFields = new Range();
private Statistics [] columnData = new Statistics[500];
private int warnForstringsLongerThan = 1000;
private int lastLineWarned = 0;
public AnalyzerWorker(AnalyzerWorkerFactory myFactory) {
this.myFactory = myFactory;
for (int i = 0; i < 500; ++i)
columnData[i] = new Statistics();
}
private void check(int recordNo, Statistics s, String w) {
if (w == null) {
s.optional = true;
return;
}
w = w.trim();
int len = w.length();
if (len == 0) {
s.optional = true;
return;
}
if (len > warnForstringsLongerThan && recordNo > lastLineWarned) {
LOG.info("Line {} contains a field of length {}", recordNo, len);
lastLineWarned = recordNo;
}
s.len.upd(len);
if (s.values.size() < MAX_DIFFERENT_VALUES)
s.values.add(w);
int dots = 0;
int colons = 0;
int slashes = 0;
int minusses = 0;
int digsBeforeDot = 0;
int digsAfterDot = 0;
for (int i = 0; i < len; ++i) {
char c = w.charAt(i);
switch (c) {
case '-':
++minusses;
break;
case '.':
++dots;
break;
case ':':
++colons;
break;
case '/':
++slashes;
break;
default:
if (!Character.isDigit(c))
s.nonDigit = true;
else {
if (dots == 0)
++digsBeforeDot;
else
++digsAfterDot;
}
if (!Character.isLowerCase(c))
s.nonLower = true;
if (!Character.isUpperCase(c))
s.nonUpper = true;
if (c > 0x7f)
s.nonAscii = true;
break;
}
}
s.dots.upd(dots);
s.colons.upd(colons);
s.slashes.upd(slashes);
s.minusses.upd(minusses);
if (digsBeforeDot > s.maxDigitsBeforeDot)
s.maxDigitsBeforeDot = digsBeforeDot;
if (digsAfterDot > s.maxDigitsAfterDot)
s.maxDigitsAfterDot = digsAfterDot;
}
@Override
public String process(int recordNo, String data) throws Exception {
// split the line
String [] cols;
if (myFactory.separator != null) {
cols = data.split(myFactory.separator);
} else {
cols = new String [myFactory.meta.getNumberOfFields()];
int currentPos = 0;
for (int i = 0; i < cols.length; ++i) {
int endPos = currentPos + myFactory.fieldLengths[i];
if (endPos > data.length())
break; // no more fields
cols[i] = data.substring(currentPos, endPos);
currentPos = endPos;
}
}
numFields.upd(cols.length);
for (int i = 0; i < cols.length; ++i)
check(recordNo, columnData[i], cols[i]);
return null;
}
@Override
public void close() throws Exception {
if (numFields.maxVal == 0)
return; // no data received
myFactory.merge(numFields, columnData);
}
}
private Range numFields = null;
private Statistics [] columnData = null;
private Integer mergeLock = new Integer(876511);
private void merge(Range numFields, Statistics [] columnData) {
synchronized(mergeLock) {
if (this.numFields == null) {
// assignment
LOG.info("initial merge");
this.numFields = numFields;
this.columnData = columnData;
} else {
LOG.info("merging result sets");
this.numFields.merge(numFields);
for (int i = 0; i < this.numFields.maxVal; ++i)
this.columnData[i].merge(columnData[i]);
}
}
}
@Override
public void close() throws Exception {
// close is invoked after all processors have been closed (and their results been merged),
// so we can output the overall result now
System.out.println("Column range is " + numFields);
System.out.println("Col OPT Uni Dig Upp Low length dots minusses slashes colons");
for (int i = 0; i < numFields.maxVal; ++i) {
Statistics s = columnData[i];
System.out.println(String.format("%3d %s %s %s %s %s %s %s %s %s %s",
i+1,
b2a(s.optional), b2a(s.nonAscii), b2a(!s.nonDigit), b2a(!s.nonUpper), b2a(!s.nonLower),
s.len, s.dots, s.minusses, s.slashes, s.colons));
}
if (meta != null) {
guessBetterDescription();
}
}
private String b2a(boolean b) {
return b ? "Y" : "N";
}
private String optionalLengthComment(Range len) {
if (len.minVal < len.maxVal)
return String.format("%d .. %d", len.minVal, len.maxVal);
return null;
}
// return the max of the len found and the original len (if the field was an alphanumeric field)
private int getLen(FieldDefinition fld, int myLen) {
if (fld == null || !(fld instanceof AlphanumericElementaryDataItem))
return myLen;
AlphanumericElementaryDataItem afld = (AlphanumericElementaryDataItem)fld;
return afld.getLength() > myLen ? afld.getLength() : myLen;
}
private String listValues(SortedSet<String> samples) {
if (samples.size() >= MAX_DIFFERENT_VALUES || samples.size() == 0)
return ""; // makes no sense to list
if (samples.size() == 1)
return " // constant value " + samples.first();
// return " // values = { " + String.join(s.values) + " }"; // needs Java 1.8
return " // values = { " + Joiner.on(", ").join(samples) + " }"; // guava
}
private void guessBetterDescription() {
System.out.println(" class " + meta.getName() + " {");
for (int i = 0; i < numFields.maxVal; ++i) {
FieldDefinition fld = i < meta.getFields().size() ? meta.getFields().get(i) : null;
Statistics s = columnData[i];
int defLen = getLen(fld, s.len.maxVal);
String type = null;
String comment = null;
if (s.len.maxVal == 0) {
type = String.format("Ascii(%d)", getLen(fld, 1));
comment = "UNUSED, ALWAYS BLANK!";
} else if (s.nonAscii) {
type = String.format("Unicode(%d)", defLen);
comment = optionalLengthComment(s.len);
} else if (s.minusses.maxVal > 0 || s.dots.maxVal > 0 || s.slashes.maxVal > 0 || s.colons.maxVal > 0) {
// signed number or date / timestamp?
if (!s.nonDigit) {
if (s.minusses.maxVal <= 1 && s.dots.maxVal <= 1 && s.slashes.maxVal == 0 && s.colons.maxVal == 0) {
// is a number
if (s.dots.maxVal > 0) {
type = String.format("%sDecimal(%d,%d)", s.minusses.maxVal > 0 ? "signed " : "",
//s.maxDigitsBeforeDot + s.maxDigitsAfterDot,
getLen(fld, s.maxDigitsBeforeDot + s.maxDigitsAfterDot + 1 + s.minusses.maxVal) - 1 - s.minusses.maxVal, // max possible digits minus decimal minus minus
s.maxDigitsAfterDot);
} else {
// int digits = s.maxDigitsBeforeDot;
int digits = getLen(fld, s.maxDigitsBeforeDot + s.minusses.maxVal) - s.minusses.maxVal; // max possible digits minus minus
type = String.format("%s%s(%d)", s.minusses.maxVal > 0 ? "signed " : "",
digits > 9 ? "Decimal" : "Number", // use an int if possible, else a (Big)Decimal
digits
);
}
} else if (s.len.maxVal <= 10) {
if (s.minusses.maxVal == 2 && s.minusses.minVal == 2 && s.dots.maxVal == 0 && s.slashes.maxVal == 0 && s.colons.maxVal == 0) {
type = "Day";
comment = "DIN format?";
} else if (s.dots.maxVal == 2 && s.dots.minVal == 2 && s.minusses.maxVal == 0 && s.slashes.maxVal == 0 && s.colons.maxVal == 0) {
type = "Day";
comment = "German format?";
} else if (s.slashes.maxVal == 2 && s.slashes.minVal == 2 && s.minusses.maxVal == 0 && s.dots.maxVal == 0 && s.colons.maxVal == 0) {
type = "Day";
comment = "US/UK format?";
} else if (s.colons.maxVal == 2 && s.colons.minVal == 2 && s.minusses.maxVal == 0 && s.dots.maxVal <= 1 && s.slashes.maxVal == 0) {
type = String.format("Time%s;", s.dots.maxVal > 0 ? "(3)" : "");
}
}
} // else fall through (ASCII)
} else if (!s.nonUpper) {
type = String.format("Uppercase(%d)", defLen);
comment = optionalLengthComment(s.len);
} else if (!s.nonLower) {
type = String.format("Lowercase(%d)", defLen);
comment = optionalLengthComment(s.len);
} else if (!s.nonDigit) {
type = String.format("%s(%d)", defLen > 18 ? "Number" : defLen > 9 ? "Long" : "Integer", defLen);
comment = "unsigned";
}
if (type == null) {
type = String.format("Ascii(%d)", defLen);
comment = optionalLengthComment(s.len);
}
System.out.println(String.format(" %s %-23s %-23s%s%s",
i >= numFields.minVal || s.optional ? "optional" : "required",
type,
(fld == null ? String.format("extra%03d", i+1) : fld.getName()) + ";",
comment == null ? "" : "// " + comment,
listValues(s.values)
));
}
System.out.println(" }");
}
}