/*
* Copyright (c) 2013 Oculus Info Inc.
* http://www.oculusinfo.com/
*
* Released under the MIT License.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is furnished to do
* so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
package com.oculusinfo.tilegen.graph.util;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.parsers.SAXParser;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
* This class handles reading in a graphml file, parses nodes and edges as desired,
* and re-saves the data in tab-delimited format.
*
* The class constructor takes in an argument map of key/values that is used to configure
* how the data is parsed:
*
* in -- Path and filename of graphML input file [required].
*
* out -- Path and filename of tab-delimited output file [required].
*
* -longIDs -- [boolean, optional] If == true, then nodes will be assigned a unique Long number ID,
* regardless of the node ID format in the original graphML file. Note, this ID convention is
* needed for data processing with Spark's GraphX library. Default == false.
*
* nAttr -- Node attributes to parse (attribute ID tags separated by commas) [optional].
* Default is to parse all node attributes.
*
* eAttr -- Edge attributes to parse (attribute ID tags separated by commas) [optional].
* Default is to parse all edge attributes.
*
* nCoordAttr -- Node attributes to use for node co-ordinates (separated by commas) [optional].
* Default is NO co-ordinate data will be associated with a given node.
*
* nCoordConvert -- Node co-ordinate conversion [optional, may be used with 'nCoordAttr' property above]
* Choices are: zorder2xy (z-order to x-y), zorder2xyz (z-order to x,y,z -- note: z-axis data will be discarded!)
* Default is no conversion
*
**/
public class GraphmlParser {
private List<String> _nodeAttributes = null;
private List<String> _edgeAttributes = null;
private List<String> _nodeCoordAttr = null;
private String _graphmlInput = null;
private String _filenameOut = null;
private String _filenameOutReadme = null;
private String _nodeCoordConvert = null;
private HashMap<String, double[]> _nodemap = new HashMap<String, double[]>();
private long _numNodes = 0;
private long _numEdges = 0;
private boolean _bLongIDs = false;
private BufferedWriter _outBuffWriter;
private BufferedWriter _outBuffWriterReadme;
private ArrayList<String> nodeAttrList = new ArrayList<String>();
private ArrayList<String> edgeAttrList = new ArrayList<String>();
//-----------
public GraphmlParser(HashMap<String, String> argMap) {
_graphmlInput = argMap.get("in"); //in, Path and filename of graphML input file
_filenameOut = argMap.get("out"); //out, Path and filename of tab-delimited output file
_filenameOutReadme = _filenameOut + "_readme";
String stringTemp = argMap.get("nAttr"); //nAttr, "Node attributes to parse (attribute ID tags separated by commas)"
if (stringTemp!=null) { // Default = parse all existing attributes.
_nodeAttributes = Arrays.asList(stringTemp.split(","));
for (int n=0; n<_nodeAttributes.size(); n++) {
_nodeAttributes.set(n, _nodeAttributes.get(n).trim());
}
}
stringTemp = argMap.get("eAttr"); //eAttr, "Edge attributes to parse (attribute ID tags separated by commas)."
if (stringTemp!=null) { // Default = parse all existing attributes.
_edgeAttributes = Arrays.asList(stringTemp.split(","));
for (int n=0; n<_edgeAttributes.size(); n++) {
_edgeAttributes.set(n, _edgeAttributes.get(n).trim());
}
}
stringTemp = argMap.get("nCoordAttr"); //nCoordAttr, "Node attributes to use for node coords (separated by commas)"
if (stringTemp!=null) {
_nodeCoordAttr = Arrays.asList(stringTemp.split(","));
for (int n=0; n<_nodeCoordAttr.size(); n++) {
_nodeCoordAttr.set(n, _nodeCoordAttr.get(n).trim());
}
}
_nodeCoordConvert = argMap.get("nCoordConvert"); //"nCoordConvert", co-ordinate conversion
// Choices are: zorder2xy, zorder2xyz (Note: z-axis data will be discarded!)
// Default = no conversion
_bLongIDs = argMap.get("longIDs").equals("true"); //"longIDs", assign unique Long IDs for each node
}
//-----------
public void parseGraphML() {
try {
SAXParserFactory factory = SAXParserFactory.newInstance();
SAXParser saxParser = factory.newSAXParser();
FileWriter fstream;
fstream = new FileWriter(_filenameOut);
_outBuffWriter = new BufferedWriter(fstream);
DefaultHandler handler = new DefaultHandler() {
ArrayList<String> nodeAttrValues = new ArrayList<String>();
ArrayList<String> edgeAttrValues = new ArrayList<String>();
boolean bInNode = false;
boolean bInEdge = false;
boolean bInData = false;
String nodeID;
String edgeSource;
String edgeTarget;
String dataKey;
//--------------
public void startElement(String uri, String localName, String qName, Attributes attributes)
throws SAXException {
if (qName.equalsIgnoreCase("key")) {
if (attributes.getValue("for").equalsIgnoreCase("node")) { // store all attribute ID's for nodes
if (_nodeAttributes==null) {
nodeAttrList.add(attributes.getValue("id"));
nodeAttrValues.add("");
} else {
if (_nodeAttributes.contains(attributes.getValue("id"))) {
nodeAttrList.add(attributes.getValue("id"));
nodeAttrValues.add("");
}
}
}
else if (attributes.getValue("for").equalsIgnoreCase("edge")) { // store all attribute ID's for edges
if (_edgeAttributes==null) {
edgeAttrList.add(attributes.getValue("id"));
edgeAttrValues.add("");
} else {
for (int n=0; n<_edgeAttributes.size(); n++) {
if (_edgeAttributes.contains(attributes.getValue("id"))) {
edgeAttrList.add(attributes.getValue("id"));
edgeAttrValues.add("");
}
}
}
}
}
else if (qName.equalsIgnoreCase("node")) {
bInNode = true;
nodeID = attributes.getValue("id");
}
else if (qName.equalsIgnoreCase("edge")) {
bInEdge = true;
edgeSource = attributes.getValue("source");
edgeTarget = attributes.getValue("target");
}
else if (qName.equalsIgnoreCase("data")) {
bInData = true;
dataKey = attributes.getValue("key");
}
}
//--------------
public void endElement(String uri, String localName, String qName) throws SAXException {
if (qName.equalsIgnoreCase("node")) {
bInNode = false;
double[] nodeData = new double[3];
if (_nodeCoordAttr!=null) {
for (int n=0; n<_nodeCoordAttr.size(); n++) {
String coordTemp = nodeAttrValues.get(nodeAttrList.indexOf(_nodeCoordAttr.get(n)));
nodeData[n] = Double.parseDouble(coordTemp);
}
if (_nodeCoordConvert!=null) {
if (_nodeCoordConvert.equals("zorder2xy")) {
Integer[] coordsTmp = mortonXY((long)nodeData[0]); //assume z-order value has been parsed into coords[0]
nodeData[0] = (double)coordsTmp[0];
nodeData[1] = (double)coordsTmp[1];
}
else if (_nodeCoordConvert.equals("zorder2xyz")) {
Integer[] coordsTmp = mortonXYZ((long)nodeData[0]); //assume z-order value has been parsed into coords[0]
nodeData[0] = (double)coordsTmp[0];
nodeData[1] = (double)coordsTmp[1];
//discard z-axis coordinates for now ...
//coords[2] = (double)coordsTmp[2];
}
}
if (_bLongIDs) {
nodeData[2] = (double)_numNodes; // also save current _numNodes values as a unique Long ID for this node
}
_nodemap.put(nodeID, nodeData);
}
else if (_bLongIDs) {
nodeData[2] = (double)_numNodes; // save current _numNodes values as a unique Long ID for this node
_nodemap.put(nodeID, nodeData);
}
_numNodes++;
//write an output line here (tab-delimited)
try {
if (_nodeCoordAttr!=null) {
if (_bLongIDs) {
// write out the unique Long ID for this node as well as the 'original' nodeID
_outBuffWriter.write("node" + "\t" + ((long)nodeData[2]) + "\t" + nodeID + "\t" + nodeData[0] + "\t" + nodeData[1]);
}
else {
_outBuffWriter.write("node" + "\t" + nodeID + "\t" + nodeData[0] + "\t" + nodeData[1]);
}
for (int i=0; i<nodeAttrValues.size(); i++) {
if (!_nodeCoordAttr.contains(nodeAttrList.get(i))) {
_outBuffWriter.write("\t" + nodeAttrValues.get(i)); // write this attribute if not already written out as coords above
}
nodeAttrValues.set(i, "");
}
_outBuffWriter.write("\n");
}
else {
if (_bLongIDs) {
// write out the unique Long ID for this node as well as the 'original' nodeID
_outBuffWriter.write("node" + "\t" + ((long)nodeData[2]) + "\t" + nodeID);
}
else {
_outBuffWriter.write("node" + "\t" + nodeID);
}
for (int i=0; i<nodeAttrValues.size(); i++) {
_outBuffWriter.write("\t" + nodeAttrValues.get(i));
nodeAttrValues.set(i, "");
}
_outBuffWriter.write("\n");
}
} catch (IOException e) {
e.printStackTrace();
}
if ((_numNodes % 100000L == 0) && (_numNodes != 0)) { // print message every 100,000 iterations
System.out.println("Number of nodes = " + _numNodes);
}
}
else if (qName.equalsIgnoreCase("edge")) {
bInEdge = false;
_numEdges++;
//write an output line here (tab-delimited)
try {
if (_nodeCoordAttr!=null) {
double[] coordSrc = _nodemap.get(edgeSource);
double[] coordTar = _nodemap.get(edgeTarget);
if (_bLongIDs) {
// write out unique Long IDs for edge source and dest for edges instead of 'original' node IDs
_outBuffWriter.write("edge" + "\t" + (long)coordSrc[2] + "\t" + coordSrc[0] + "\t" + coordSrc[1] + "\t" +
(long)coordTar[2] + "\t" + coordTar[0] + "\t" + coordTar[1]);
}
else {
_outBuffWriter.write("edge" + "\t" + edgeSource + "\t" + coordSrc[0] + "\t" + coordSrc[1] + "\t" +
edgeTarget + "\t" + coordTar[0] + "\t" + coordTar[1]);
}
for (int i=0; i<edgeAttrValues.size(); i++) {
_outBuffWriter.write("\t" + edgeAttrValues.get(i));
edgeAttrValues.set(i, "");
}
_outBuffWriter.write("\n");
}
else {
if (_bLongIDs) {
// write out unique Long IDs for edge source and dest for edges instead of 'original' node IDs
double[] dataSrc = _nodemap.get(edgeSource);
double[] dataTar = _nodemap.get(edgeTarget);
_outBuffWriter.write("edge" + "\t" + (long)dataSrc[2] + "\t" + (long)dataTar[2]);
}
else {
_outBuffWriter.write("edge" + "\t" + edgeSource + "\t" + edgeTarget);
}
for (int i=0; i<edgeAttrValues.size(); i++) {
_outBuffWriter.write("\t" + edgeAttrValues.get(i));
edgeAttrValues.set(i, "");
}
_outBuffWriter.write("\n");
}
} catch (IOException e) {
e.printStackTrace();
}
if ((_numEdges % 100000L == 0) && (_numEdges != 0)) { // print message every 100,000 iterations
System.out.println("Number of edges = " + _numEdges);
}
}
else if (qName.equalsIgnoreCase("data")) {
bInData = false;
}
}
//--------------
public void characters(char ch[], int start, int length)
throws SAXException {
if (bInData) {
String strData = new String(ch, start, length);
if (bInNode) {
if (nodeAttrList.contains(dataKey)) //save node attribute value to map
nodeAttrValues.set(nodeAttrList.indexOf(dataKey), strData);
}
else if (bInEdge) {
if (edgeAttrList.contains(dataKey)) //save edge attribute value to map
edgeAttrValues.set(edgeAttrList.indexOf(dataKey), strData);
}
}
}
};
saxParser.parse(_graphmlInput, handler);
_outBuffWriter.close();
} catch (Exception e) {
e.printStackTrace();
}
try {
// save column labels to a readme file
FileWriter fstream;
fstream = new FileWriter(_filenameOutReadme);
_outBuffWriterReadme = new BufferedWriter(fstream);
_outBuffWriterReadme.write("This readme file contains column labels for tab-delimited graph data in \n" + _filenameOut + "\n\n");
_outBuffWriterReadme.write("Column labels for nodes are as follows:\n\n");
if (_bLongIDs) {
// write out the unique Long ID for this node as well as the 'original' nodeID
_outBuffWriterReadme.write("node\tnode ID\toriginal node ID");
}
else {
_outBuffWriterReadme.write("node\tnode ID");
}
for (int i=0; i<nodeAttrList.size(); i++) {
_outBuffWriterReadme.write("\t" + nodeAttrList.get(i));
}
_outBuffWriterReadme.write("\n\n");
_outBuffWriterReadme.write("Column labels for edges are as follows:\n\n");
_outBuffWriterReadme.write("edge\tsource ID\tdestination ID");
for (int i=0; i<edgeAttrList.size(); i++) {
_outBuffWriterReadme.write("\t" + edgeAttrList.get(i));
}
_outBuffWriterReadme.write("\n");
_outBuffWriterReadme.close();
} catch (Exception e) {
e.printStackTrace();
}
System.out.println("");
System.out.println("Total number of nodes = " + _numNodes);
System.out.println("Total number of edges = " + _numEdges);
System.out.println("");
System.out.println("Results saved at " + _filenameOut);
System.out.println("Column labels saved at " + _filenameOutReadme);
}
//--------------------------------------------
// Decode Morton z-order number to X, Y, Z co-ordinates
// (from http://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/)
private Integer[] mortonXYZ(long zOrder) {
Integer[] coords = new Integer[3];
coords[0] = compact1By2(zOrder);
coords[1] = compact1By2(zOrder>>1);
coords[2] = compact1By2(zOrder>>2);
return coords;
}
//--------------------------------------------
// Decode Morton z-order number to X and Y co-ordinates
// (from http://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/)
private Integer[] mortonXY(long zOrder) {
Integer[] coords = new Integer[2];
coords[0] = compact1By1(zOrder);
coords[1] = compact1By1(zOrder>>1);
return coords;
}
private int compact1By2(long x)
{
x &= 0x09249249; // x = ---- 9--8 --7- -6-- 5--4 --3- -2-- 1--0
x = (x ^ (x >> 2)) & 0x030c30c3; // x = ---- --98 ---- 76-- --54 ---- 32-- --10
x = (x ^ (x >> 4)) & 0x0300f00f; // x = ---- --98 ---- ---- 7654 ---- ---- 3210
x = (x ^ (x >> 8)) & 0xff0000ff; // x = ---- --98 ---- ---- ---- ---- 7654 3210
x = (x ^ (x >> 16)) & 0x000003ff; // x = ---- ---- ---- ---- ---- --98 7654 3210
return (int)x;
}
private int compact1By1(long x)
{
x &= 0x55555555; // x = -f-e -d-c -b-a -9-8 -7-6 -5-4 -3-2 -1-0
x = (x ^ (x >> 1)) & 0x33333333; // x = --fe --dc --ba --98 --76 --54 --32 --10
x = (x ^ (x >> 2)) & 0x0f0f0f0f; // x = ---- fedc ---- ba98 ---- 7654 ---- 3210
x = (x ^ (x >> 4)) & 0x00ff00ff; // x = ---- ---- fedc ba98 ---- ---- 7654 3210
x = (x ^ (x >> 8)) & 0x0000ffff; // x = ---- ---- ---- ---- fedc ba98 7654 3210
return (int)x;
}
}