/* * This file is part of JBIRCH. * * JBIRCH is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * JBIRCH is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with JBIRCH. If not, see <http://www.gnu.org/licenses/>. * */ /* * CFNode.java * Copyright (C) 2009 Roberto Perdisci (roberto.perdisci@gmail.com) */ package org.streaminer.stream.clustering.birch; import java.util.*; /** * * @author Roberto Perdisci (roberto.perdisci@gmail.com) * @version 0.1 * */ public class CFNode { private static final String LINE_SEP = System.getProperty("line.separator"); private ArrayList<CFEntry> entries = null; // stores the CFEntries for this node private int maxNodeEntries = 0; // max number of entries per node (parameter B) private double distThreshold = 0; // the distance threshold (parameter T), a.k.a. "radius" private int distFunction = CFTree.D0_DIST; // the distance function to use private boolean leafStatus = false; // if true, this is a leaf private CFNode nextLeaf = null; // pointer to the next leaf (if not a leaf, pointer will be null) private CFNode previousLeaf = null; // pointer to the previous leaf (if not a leaf, pointer will be null) private boolean applyMergingRefinement = false; // if true, merging refinement will be applied after every split public CFNode(int maxNodeEntries, double distThreshold, int distFunction, boolean applyMergingRefinement, boolean leafStatus) { this.maxNodeEntries = maxNodeEntries; this.distThreshold = distThreshold; this.distFunction = distFunction; this.entries = new ArrayList<CFEntry>(maxNodeEntries); this.leafStatus = leafStatus; this.applyMergingRefinement = applyMergingRefinement; } /** * * @return the number of CFEntries in the node */ public int size() { return entries.size(); } /** * * @return true if this is only a place-holder node for maintaining correct pointers in the list of leaves */ public boolean isDummy() { return (maxNodeEntries==0 && distThreshold==0 && this.size()==0 && (previousLeaf!=null || nextLeaf!=null)); } /** * * @return the max number of entries the node can host (parameter B) */ public int getMaxNodeEntries() { return maxNodeEntries; } /** * * @return the distance threshold used to decide whether a CFEntry can absorb a new entry */ public double getDistThreshold() { return distThreshold; } public int getDistFunction() { return distFunction; } protected CFNode getNextLeaf() { return nextLeaf; } protected CFNode getPreviousLeaf() { return previousLeaf; } protected void addToEntryList(CFEntry e) { this.entries.add(e); } protected ArrayList<CFEntry> getEntries() { return this.entries; } /** * Retrieves the subcluster id of the closest leaf entry to e * * @param e the entry to be mapped * @return a positive integer, if the leaf entries were enumerated after data insertion is finished, otherwise -1 */ public int mapToClosestSubcluster(CFEntry e) { CFEntry closest = findClosestEntry(e); if(!closest.hasChild()) return closest.getSubclusterID(); return closest.getChild().mapToClosestSubcluster(e); } /** * Inserts a new entry to the CFTree * * @param e the entry to be inserted * @return TRUE if the new entry could be inserted without problems, otherwise we need to split the node */ public boolean insertEntry(CFEntry e) { if(entries.size()==0) { // if the node is empty we can insert the entry directly here entries.add(e); return true; // insert was successful. no split necessary } CFEntry closest = findClosestEntry(e); // System.out.println("Closest Entry = " + closest); boolean dontSplit = false; if(closest.hasChild()) { // if closest has a child we go down with a recursive call dontSplit = closest.getChild().insertEntry(e); if(dontSplit) { closest.update(e); // this updates the CF to reflect the additional entry return true; } else { // if the node below /closest/ didn't have enough room to host the new entry // we need to split it CFEntryPair splitPair = splitEntry(closest); // after adding the new entries derived from splitting /closest/ to this node, // if we have more than maxEntries we return false, // so that the parent node will be split as well to redistribute the "load" if(entries.size()>maxNodeEntries) { return false; } else { // splitting stops at this node if(applyMergingRefinement) // performs step 4 of insert process (see BIRCH paper, Section 4.3) mergingRefinement(splitPair); return true; } } } else if(closest.isWithinThreshold(e, distThreshold, distFunction)) { // if dist(closest,e) <= T, /e/ will be "absorbed" by /closest/ closest.update(e); return true; // no split necessary at the parent level } else if(entries.size()<maxNodeEntries) { // if /closest/ does not have children, and dist(closest,e) > T // if there is enough room in this node, we simply add e to it entries.add(e); return true; // no split necessary at the parent level } else { // not enough space on this node entries.add(e); // adds it momentarily to this node return false; // returns false so that the parent entry will be split } } /** * * @param closest the entry to be split * @return the new entries derived from splitting */ public CFEntryPair splitEntry(CFEntry closest) { // IF there was a child, but we could not insert the new entry without problems THAN // split the child of closest entry CFNode oldNode = closest.getChild(); ArrayList<CFEntry> oldEntries = closest.getChild().getEntries(); CFEntryPair p = findFarthestEntryPair(oldEntries); CFEntry newEntry1 = new CFEntry(); CFNode newNode1 = new CFNode(maxNodeEntries,distThreshold,distFunction,applyMergingRefinement,oldNode.isLeaf()); newEntry1.setChild(newNode1); CFEntry newEntry2 = new CFEntry(); CFNode newNode2 = new CFNode(maxNodeEntries,distThreshold,distFunction,applyMergingRefinement,oldNode.isLeaf()); newEntry2.setChild(newNode2); if(oldNode.isLeaf()) { // we do this to preserve the pointers in the leafList CFNode prevL = oldNode.getPreviousLeaf(); CFNode nextL = oldNode.getNextLeaf(); /* DEBUGGING STUFF... System.out.println(">>>>>>>>>>>>>>>>>> SPLIT <<<<<<<<<<<<<<<<<<<<"); System.out.println("PREVL : " + prevL); System.out.println("NEXTL : " + nextL); */ if(prevL!=null) { prevL.setNextLeaf(newNode1); } if(nextL!=null) { nextL.setPreviousLeaf(newNode2); } newNode1.setPreviousLeaf(prevL); newNode1.setNextLeaf(newNode2); newNode2.setPreviousLeaf(newNode1); newNode2.setNextLeaf(nextL); } redistributeEntries(oldEntries,p,newEntry1,newEntry2); // redistributes the entries in n between newEntry1 and newEntry2 // according to the distance to p.e1 and p.e2 entries.remove(closest); // this will be substitute by two new entries entries.add(newEntry1); entries.add(newEntry2); CFEntryPair newPair = new CFEntryPair(newEntry1, newEntry2); /* DEBUGGING STUFF... if(oldNode.isLeaf()) { System.out.println(">>>>>>>>>>>>>>>>>> ---- <<<<<<<<<<<<<<<<<<<<"); System.out.println("PREVL : " + newNode1.getPreviousLeaf()); System.out.println("N1 : " + newNode1); System.out.println("N1.NEXT : " + newNode1.getNextLeaf()); System.out.println("N2 : " + newNode2); System.out.println("N2.PREV : " + newNode2.getPreviousLeaf()); System.out.println("NEXTL : " + newNode2.getNextLeaf()); System.out.println(">>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<"); } */ return newPair; } /** * Called when splitting is necessary * * @param oldEntries * @param farEntries * @param newE1 * @param newE2 */ protected void redistributeEntries(ArrayList<CFEntry> oldEntries, CFEntryPair farEntries, CFEntry newE1, CFEntry newE2) { for(CFEntry e : oldEntries) { double dist1 = farEntries.e1.distance(e,distFunction); double dist2 = farEntries.e2.distance(e,distFunction); if(dist1<=dist2) { newE1.addToChild(e); newE1.update(e); } else { newE2.addToChild(e); newE2.update(e); } } } /** * Called when "merging refinement" is attempted but no actual merging can be applied * * @param oldEntries1 * @param oldEntries2 * @param cloePair * @param e1 * @param e2 */ protected void redistributeEntries(ArrayList<CFEntry> oldEntries1, ArrayList<CFEntry> oldEntries2, CFEntryPair closeEntries, CFEntry newE1, CFEntry newE2) { ArrayList<CFEntry> v = new ArrayList<CFEntry>(); v.addAll(oldEntries1); v.addAll(oldEntries2); for(CFEntry e : v) { double dist1 = closeEntries.e1.distance(e,distFunction); double dist2 = closeEntries.e2.distance(e,distFunction); if(dist1<=dist2) { if(newE1.getChildSize()<maxNodeEntries) { newE1.addToChild(e); newE1.update(e); } else { newE2.addToChild(e); newE2.update(e); } } else if(dist2<dist1) { if(newE2.getChildSize()<maxNodeEntries) { newE2.addToChild(e); newE2.update(e); } else { newE1.addToChild(e); newE1.update(e); } } } } /** * Called when "merging refinement" is attempted and two entries are actually merged * * @param oldEntries1 * @param oldEntries2 * @param cloePair * @param e1 * @param e2 */ protected void redistributeEntries(ArrayList<CFEntry> oldEntries1, ArrayList<CFEntry> oldEntries2, CFEntry newE) { ArrayList<CFEntry> v = new ArrayList<CFEntry>(); v.addAll(oldEntries1); v.addAll(oldEntries2); for(CFEntry e : v) { newE.addToChild(e); newE.update(e); } } /** * * @param e a CFEntry * @return the entry in this node that is closest to e */ protected CFEntry findClosestEntry(CFEntry e) { double minDist = Double.MAX_VALUE; CFEntry closest = null; for(CFEntry c : entries) { double d = c.distance(e,distFunction); if(d<minDist) { minDist = d; closest = c; } } return closest; } protected CFEntryPair findFarthestEntryPair(ArrayList<CFEntry> entries) { if(entries.size()<2) return null; double maxDist = -1; CFEntryPair p = new CFEntryPair(); for(int i=0; i<entries.size()-1; i++) { for(int j=i+1; j<entries.size(); j++) { CFEntry e1 = entries.get(i); CFEntry e2 = entries.get(j); double dist = e1.distance(e2,distFunction); if(dist>maxDist) { p.e1 = e1; p.e2 = e2; maxDist = dist; } } } return p; } protected CFEntryPair findClosestEntryPair(ArrayList<CFEntry> entries) { if(entries.size()<2) return null; // not possible to find a valid pair double minDist = Double.MAX_VALUE; CFEntryPair p = new CFEntryPair(); for(int i=0; i<entries.size()-1; i++) { for(int j=i+1; j<entries.size(); j++) { CFEntry e1 = entries.get(i); CFEntry e2 = entries.get(j); double dist = e1.distance(e2,distFunction); if(dist<minDist) { p.e1 = e1; p.e2 = e2; minDist = dist; } } } return p; } /** * Used during merging refinement * * @param p * @param newE1 * @param newE2 */ private void replaceClosestPairWithNewEntries(CFEntryPair p, CFEntry newE1, CFEntry newE2) { for(int i=0; i<this.entries.size(); i++) { if(this.entries.get(i).equals(p.e1)) this.entries.set(i, newE1); else if(this.entries.get(i).equals(p.e2)) this.entries.set(i, newE2); } } /** * Used during merging refinement * * @param p * @param newE */ private void replaceClosestPairWithNewMergedEntry(CFEntryPair p, CFEntry newE) { for(int i=0; i<this.entries.size(); i++) { if(this.entries.get(i).equals(p.e1)) this.entries.set(i, newE); else if(this.entries.get(i).equals(p.e2)) this.entries.remove(i); } } /** * * @param splitEntries the entry that got split * */ public void mergingRefinement(CFEntryPair splitEntries) { // System.out.println(">>>>>>>>>>>>>>> Merging Refinement <<<<<<<<<<<<"); // System.out.println(splitEntries.e1); // System.out.println(splitEntries.e2); ArrayList<CFEntry> nodeEntries = this.entries; CFEntryPair p = findClosestEntryPair(nodeEntries); if(p==null) // not possible to find a valid pair return; if(p.equals(splitEntries)) return; // if the closet pair is the one that was just split, we terminate CFNode oldNode1 = p.e1.getChild(); CFNode oldNode2 = p.e2.getChild(); ArrayList<CFEntry> oldNode1Entries = oldNode1.getEntries(); ArrayList<CFEntry> oldNode2Entries = oldNode2.getEntries(); if(oldNode1.isLeaf() != oldNode2.isLeaf()) { // just to make sure everything is going ok System.err.println("ERROR: Nodes at the same level must have same leaf status"); System.exit(2); } if((oldNode1Entries.size() + oldNode2Entries.size()) > maxNodeEntries) { // the two nodes cannot be merged into one (they will not fit) // in this case we simply redistribute them between p.e1 and p.e2 CFEntry newEntry1 = new CFEntry(); // note: in the CFNode construction below the last parameter is false // because a split cannot happen at the leaf level // (the only exception is when the root is first split, but that's treated separately) CFNode newNode1 = oldNode1; newNode1.resetEntries(); newEntry1.setChild(newNode1); CFEntry newEntry2 = new CFEntry(); CFNode newNode2 = oldNode2; newNode2.resetEntries(); newEntry2.setChild(newNode2); redistributeEntries(oldNode1Entries, oldNode2Entries, p, newEntry1, newEntry2); replaceClosestPairWithNewEntries(p, newEntry1, newEntry2); } else { // if the the two closest entries can actually be merged into one single entry CFEntry newEntry = new CFEntry(); // note: in the CFNode construction below the last parameter is false // because a split cannot happen at the leaf level // (the only exception is when the root is first split, but that's treated separately) CFNode newNode = new CFNode(maxNodeEntries,distThreshold,distFunction,applyMergingRefinement,oldNode1.isLeaf()); newEntry.setChild(newNode); redistributeEntries(oldNode1Entries, oldNode2Entries, newEntry); if(oldNode1.isLeaf() && oldNode2.isLeaf()) { // this is done to maintain proper links in the leafList if(oldNode1.getPreviousLeaf()!=null) oldNode1.getPreviousLeaf().setNextLeaf(newNode); if(oldNode1.getNextLeaf()!=null) oldNode1.getNextLeaf().setPreviousLeaf(newNode); newNode.setPreviousLeaf(oldNode1.getPreviousLeaf()); newNode.setNextLeaf(oldNode1.getNextLeaf()); // this is a dummy node that is only used to maintain proper links in the leafList // no CFEntry will ever point to this leaf CFNode dummy = new CFNode(0,0,0,false,true); if(oldNode2.getPreviousLeaf()!=null) oldNode2.getPreviousLeaf().setNextLeaf(dummy); if(oldNode2.getNextLeaf()!=null) oldNode2.getNextLeaf().setPreviousLeaf(dummy); dummy.setPreviousLeaf(oldNode2.getPreviousLeaf()); dummy.setNextLeaf(oldNode2.getNextLeaf()); } replaceClosestPairWithNewMergedEntry(p, newEntry); } // merging refinement is done } /** * Substitutes the entries in this node with the entries of the parameter node * * @param n the node from which entries are copied */ private void replaceEntries(CFNode n) { this.entries = n.entries; } private void resetEntries() { this.entries = new ArrayList<CFEntry>(); } public boolean isLeaf() { return this.leafStatus; } /** * * @return true if merging refinement is enabled */ public boolean applyMergingRefinement() { return this.applyMergingRefinement; } protected void setLeafStatus(boolean status) { this.leafStatus = status; } protected void setNextLeaf(CFNode l) { this.nextLeaf = l; } protected void setPreviousLeaf(CFNode l) { this.previousLeaf = l; } protected int countChildrenNodes() { int n=0; for(CFEntry e : this.entries) { if(e.hasChild()) { n++; n += e.getChild().countChildrenNodes(); } } return n; } protected int countEntriesInChildrenNodes() { int n=0; for(CFEntry e : this.entries) { if(e.hasChild()) { n += e.getChild().size(); n += e.getChild().countChildrenNodes(); } } return n; } public String toString() { StringBuffer buff = new StringBuffer(); buff.append("==============================================" + LINE_SEP); if(this.isLeaf()) buff.append(">>> THIS IS A LEAF " + LINE_SEP); buff.append("Num of Entries = " + entries.size() + LINE_SEP); buff.append("{"); for(CFEntry e : entries) { buff.append("[" + e + "]"); } buff.append("}" + LINE_SEP); buff.append("==============================================" + LINE_SEP); return buff.toString(); } }