package org.seqcode.genome.location;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.LinkedList;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.seqcode.genome.Genome;
import org.seqcode.genome.location.ChromosomeInfo;
import org.seqcode.gseutils.Saveable;
import org.seqcode.math.stats.StatUtil;
/**
* A <code>Region</code> represents an interval along some chromosome in a genome <br>
* <i>Note</i>: We assume 1-based, inclusive coordinate.
*/
public class Region implements Comparable<Region>, Saveable {
/**
* regex for matching a region: newline-whitespace-one or more word chars or
* digits-:-whitespace- one or more digits or commas-an optional m, M, k, or
* K-whitespace- hyphen-whitespace-one or more digits or commas-an optional m,
* M, k, or K- whitespace
*/
public static final String COMPLETE_REGION_REG_EX = "^\\s*([\\w\\d]+):\\s*([,\\d]+[mMkK]?)\\s*-\\s*([,\\d]+[mMkK]?)\\s*";
private static final Pattern REGION_PATTERN = Pattern.compile(COMPLETE_REGION_REG_EX);
// private static final Matcher REGION_MATCHER = REGION_PATTERN.matcher("");
/**
* regex for matching a point: newline-whitespace-one or more word chars or
* digits-:-whitespace- one or more digits or commas-an optional m, M, k, or
* K-whitespace
*/
public static final String POINT_REGION_REG_EX = "^\\s*([\\w\\d]+):\\s*([,\\d]+[mMkK]?)\\s*";
private static final Pattern POINT_PATTERN = Pattern.compile(POINT_REGION_REG_EX);
// private static final Matcher POINT_MATCHER = POINT_PATTERN.matcher("");
/**
* regex for matching a chromosome: newline-whitespace-one or more word chars
* or digits-whitespace
*/
public static final String CHROM_REGION_REG_EX = "^\\s*([\\w\\d]+)\\s*";
private static final Pattern CHROM_PATTERN = Pattern.compile(CHROM_REGION_REG_EX);
// private static final Matcher CHROM_MATCHER = CHROM_PATTERN.matcher("");
/**
* The genome that this region corresponds to
*/
private Genome g;
/**
* The chromosome that this region corresponds to
*/
private String chrom;
/**
* The start of this region (in bp)<br>
* <i>Note</i>: The chromosome starts at 1
*/
private int start;
/**
* The end of this region (in bp)<br>
*/
private int end;
public Region(Region copied) {
g = copied.g;
chrom = copied.chrom;
start = copied.start;
end = copied.end;
}
/**
* Creates a new Region that encompasses first and second. Throws
* IllegalArgumentException if <code>first</code> and <code>second</code>
* aren't on the same chromosome or they don't correspond to the same genome
*/
public Region(Region first, Region second) {
if (!first.g.equals(second.g)) {
throw new IllegalArgumentException();
}
if (!first.chrom.equals(second.chrom)) {
throw new IllegalArgumentException();
}
g = first.g;
chrom = first.chrom;
start = Math.min(first.start, second.start);
end = Math.max(first.end, second.end);
}
/**
* Creates a Region. Our codebase is probably not fully consistent about 0 or
* 1 based coordinates and whether the start and end are inclusive or
* exclusive.
*
* @param g
* : the genome
* @param c
* : the name of the chromosome
* @param start
* : the start coordinate
* @param end
* : the end coordinate
*/
public Region(Genome g, String c, int start, int end) {
if (start > end) {
throw new IllegalArgumentException(String.format("Start > End for this region : %d > %d", start, end));
}
if (g == null) {
throw new NullPointerException("Can't have a null genome");
}
if (c == null) {
throw new NullPointerException("Can't have a null chromosome");
}
this.g = g;
chrom = c;
this.start = start;
this.end = end;
}
public Region(Genome g, DataInputStream dis) throws IOException {
this.g = g;
chrom = dis.readUTF();
start = dis.readInt();
end = dis.readInt();
}
public void save(DataOutputStream dos) throws IOException {
dos.writeUTF(chrom);
dos.writeInt(start);
dos.writeInt(end);
}
public Genome getGenome() {
return g;
}
public String getChrom() {
return chrom;
}
public int getStart() {
return start;
}
public int getEnd() {
return end;
}
public char getStrand(){
return '.';
}
public Region clone() {
return new Region(g, chrom, start, end);
}
/**
* @deprecated Some objects rely on Region being immutable. Change the start
* and end of the region.
*/
protected void setStartEnd(int start, int end) {
this.start = start;
this.end = end;
if (this.start < 1) {
this.start = 1;
}
if (this.start > this.end) {
this.start = this.end;
}
}
/**
* Subtracts the input Region from this Region and returns the remainder.<br>
* In case, the two regions do not overlap, the region between them is
* returned.
*
* @throws IllegalArgumentException
* if <code>r</code> is not on the same chromosome as this or the
* two regions correspond to different genomes
*/
public Collection<Region> getSubtractionFragments(Region r) {
if (!g.equals(r.g)) {
throw new IllegalArgumentException();
}
if (!chrom.equals(r.chrom)) {
throw new IllegalArgumentException();
}
LinkedList<Region> lst = new LinkedList<Region>();
if (start != r.start || end != r.end) {
if (overlaps(r)) {
lst.addLast(new Region(g, chrom, Math.min(start, r.start), Math.max(start, r.start) - 1));
lst.addLast(new Region(g, chrom, Math.min(end, r.end) + 1, Math.max(end, r.end)));
}
else {
if (before(r)) {
lst.addLast(new Region(g, chrom, end + 1, r.start - 1));
}
else {
lst.addLast(new Region(g, chrom, r.end + 1, start - 1));
}
}
}
return lst;
}
/**
* Returns the distance between <code>r</code> and <code>this</code>.
* Overlapping Regions have distance zero.
*
* @throws IllegalArgumentException
* if <code>r</code> is not on the same chromosome as
* <code>this</code>
*/
public int distance(Region r) {
if (!chrom.equals(r.chrom)) {
throw new IllegalArgumentException(r.chrom + " is not " + chrom);
}
if (overlaps(r)) {
return 0;
}
if (before(r)) {
return r.start - end;
}
if (after(r)) {
return start - r.end;
}
return 0;
}
/**
* Returns the distance between <code>p</code> and <code>this</code>
*
* @throws IllegalArgumentException
* if <code>p</code> is not on the same chromosome as
* <code>this</code>
*/
public int distance(Point p) {
if (!chrom.equals(p.getChrom())) {
throw new IllegalArgumentException(p.getChrom());
}
if (p.getLocation() < start) {
return start - p.getLocation();
}
if (p.getLocation() > end) {
return p.getLocation() - end;
}
return 0;
}
public Region getOverlap(Region r) {
if (!overlaps(r)) {
return null;
}
int ns = Math.max(start, r.start);
int ne = Math.min(end, r.end);
return new Region(g, chrom, ns, ne);
}
/**
* Returns the number of bp overlap between <code>r</code> and
* <code>this</code>.
*/
public int getOverlapSize(Region r) {
if (!overlaps(r)) {
return 0;
}
int ns = Math.max(start, r.start), ne = Math.min(end, r.end);
return ne - ns + 1;
}
/**
* Returns the width (or size) of this region. This method assumes that both
* <code>start</code> and <code>end</code> are inclusive
*/
public int getWidth() {
return end - start + 1;
}
/**
* Returns true iff <code>this</code> comes before <code>r</code> along the
* chromosome.
*
* @throws IllegalArgumentException
* if <code>r</code> is not on the same chromosome as
* <code>this</code> or corresponds to a different genome
*/
public boolean before(Region r) {
if (!g.equals(r.g)) {
throw new IllegalArgumentException();
}
if (!chrom.equals(r.chrom)) {
throw new IllegalArgumentException();
}
return end < r.start;
}
/**
* Returns true iff <code>this</code> comes after <code>r</code> along the
* chromosome.
*
* @throws IllegalArgumentException
* if <code>r</code> is not on the same chromosome as
* <code>this</code> or corresponds to a different genome
*/
public boolean after(Region r) {
if (!g.equals(r.g)) {
throw new IllegalArgumentException();
}
if (!chrom.equals(r.chrom)) {
throw new IllegalArgumentException();
}
return start > r.end;
}
/**
* Returns a new Region that is <code>this</code> expanded by
* <code>dstart</code> bases to the left and <code>dend</code> bases to the
* right.<br>
* <i>Note</i>: If any of the arguments exceeds the corresponding ends of the
* chromosome, the expansion is limited to that end.
*/
public Region expand(int dstart, int dend) {
int chromLength = g.getChromLength(chrom);
int ns = start - dstart;
int ne = end + dend;
if (ns < 1) {
ns = 1;
}
if (ne > chromLength ) {
ne = chromLength ;
if (ns > ne) {
ns = ne;
}
}
return new Region(g, chrom, ns, ne);
}
/**
* Returns a new Region with the same midpoint as <code>this</code>
* but with width <code>width</code>
* @param width
* @return
*/
public Region resize(int width) {
return this.getMidpoint().expand(width/2);
}
/**
* Checks for full or partial overlapping between <code>this</code> and
* <code>r</code> region.
*
* @param r
* @return
*/
public boolean overlaps(Region r) {
if (!chrom.equals(r.chrom)) {
return false;
}
if (start <= r.start && end >= r.start) {
return true;
}
if (r.start <= start && r.end >= start) {
return true;
}
return false;
}
/**
* Checks for full or partial overlapping between <code>this</code> region and
* the region that corresponds between <code>otherstart</code> and
* <code>otherend</code>.
*
* @param otherstart
* @param otherend
* @return
*/
public boolean overlaps(int otherstart, int otherend) {
if (start <= otherstart && end >= otherstart) {
return true;
}
if (otherstart <= start && otherend >= start) {
return true;
}
return false;
}
/**
* Checks whether <tt>r</tt> region is fully contained in <tt>this</tt>
* region.
*
* @param r
* @return
*/
public boolean contains(Region r) {
if (!chrom.equals(r.chrom)) {
return false;
}
return start <= r.start && end >= r.end;
}
/**
* Checks whether point <tt>p</tt> is contained in <tt>this</tt> region.
*
* @param p
* @return
*/
public boolean contains(Point p) {
if (!chrom.equals(p.getChrom())) {
return false;
}
return start <= p.getLocation() && end >= p.getLocation();
}
public Point startPoint() {
return new Point(getGenome(), getChrom(), getStart());
}
public Point endPoint() {
return new Point(getGenome(), getChrom(), getEnd());
}
/**
* Checks whether region <tt>r</tt> is the same as <tt>this</tt> region.
*
* @param r
* @return
*/
public boolean matchesRegion(Region r) {
return g.equals(r.getGenome()) && chrom.equals(r.getChrom()) && (start == r.getStart()) && (end == r.getEnd());
}
/**
* Returns the <i>super-region</i> that contains both regions <tt>this</tt>
* and <tt>o</tt>.
*
* @param o
* @return
*/
public Region combine(Region o) {
if (!getChrom().equals(o.getChrom())) {
throw new IllegalArgumentException(getChrom() + " != " + o.getChrom());
}
int ns = Math.min(getStart(), o.getStart());
int ne = Math.max(getEnd(), o.getEnd());
return new Region(getGenome(), getChrom(), ns, ne);
}
/**
* Returns the mid-point (as Point) that corresponds to this region.
*
* @return
*/
public Point getMidpoint() {
int middle = (start + end) / 2;
return new Point(g, chrom, middle);
}
/**
* regionString() should <b>not</b> be overridden in any subclasses of Region
* -- it is meant to always return a string with the chromosome, start, and
* stop (and only that information).
*
* @return A string with the format, "chrom:start-end"
*/
public String regionString() {
return String.format("chr%s:%d-%d", chrom, start, end);
}
/**
*
* @param input
* @return
*/
public static boolean isValidCompleteRegionString(String input) {
String trimmed = input.trim();
Matcher regionMatcher = REGION_PATTERN.matcher(trimmed);
return regionMatcher.matches();
}
/**
*
* @param input
* @return
*/
public static boolean isValidPointRegionString(String input) {
String trimmed = input.trim();
Matcher pointMatcher = POINT_PATTERN.matcher(trimmed);
return pointMatcher.matches();
}
/**
*
* @param input
* @return
*/
public static boolean isValidChromRegionString(String input) {
String trimmed = input.trim();
Matcher chromMatcher = CHROM_PATTERN.matcher(trimmed);
return chromMatcher.matches();
}
/**
*
* @param input
* @return
*/
public static String chromFromLocationString(String input) {
String trimmed = input.trim();
Matcher chromMatcher = CHROM_PATTERN.matcher(trimmed);
String chromStr = null;
if (chromMatcher.find()) {
chromStr = chromMatcher.group(1);
if (chromStr.startsWith("chr")) {
chromStr = chromStr.substring(3, chromStr.length());
}
}
return chromStr;
}
/**
*
* @param input
* @return
*/
public static int startFromLocationString(String input) {
String trimmed = input.trim();
Matcher regionMatcher = REGION_PATTERN.matcher(trimmed);
int start = -1;
if (regionMatcher.matches()) {
String startStr = regionMatcher.group(2);
String endStr = regionMatcher.group(3);
startStr = startStr.replaceAll(",", "");
endStr = endStr.replaceAll(",", "");
start = Math.min(stringToNum(startStr), stringToNum(endStr));
}
return start;
}
/**
*
* @param input
* @return
*/
public static synchronized int stopFromLocationString(String input) {
String trimmed = input.trim();
Matcher regionMatcher = REGION_PATTERN.matcher(trimmed);
int end = -1;
if (regionMatcher.matches()) {
String startStr = regionMatcher.group(2);
String endStr = regionMatcher.group(3);
startStr = startStr.replaceAll(",", "");
endStr = endStr.replaceAll(",", "");
end = Math.max(stringToNum(startStr), stringToNum(endStr));
}
return end;
}
/**
* getLocationString() returns, by default, the same as regionString().
* However, this method can be overridden by subclasses to add additional
* information to the returned String. Any additional information added by a
* subclass should be separated by a ":". For instance, a StrandedRegion might
* return "chrom:start-stop:strand".
*
* @return A string representation of the Region's genomic location
*/
public String getLocationString() {
return regionString();
}
/**
* By default, returns the result of getLocationString(). Can be modified in
* any appropriate way by a subclass.
*
* @return A string represention of the object.
*/
public String toString() {
return getLocationString();
}
/**
* parses the input String into a Region. The Region can either be strict
* coordinates (eg, chrom/start/stop). Understands abbreviates in the
* coordinates such as k and m. <br>
* The method accepts either the form: <blockquote>
*
* <pre>
* chromosome:start-end:[strand] ([] stands for optional) - <i>strictly specified region</i>
* <i>or</i>
* chromosome:start:[strand] - <i>region from start to start+1</i>
* <i>or</i>
* chromosome:[strand] - <i>whole chromosome</i>
* </pre>
*
* </blockquote>
*/
public static Region fromString(Genome genome, String input) throws NumberFormatException {
String pieces[] = input.split(":");
char strand = ' ';
if (pieces.length == 3 && pieces[2].length() == 1) {
strand = pieces[2].charAt(0);
input = pieces[0] + ":" + pieces[1];
}
String trimmed = input.trim();
Matcher regionmatcher = REGION_PATTERN.matcher(trimmed);
Matcher pointmatcher = POINT_PATTERN.matcher(trimmed);
Matcher chrommatcher = CHROM_PATTERN.matcher(trimmed);
Region output = null;
if (regionmatcher.find()) {
if (regionmatcher.groupCount() != 3) {
return null;
}
String chromStr = regionmatcher.group(1);
String startStr = regionmatcher.group(2);
String endStr = regionmatcher.group(3);
if (chromStr.startsWith("chr")) {
chromStr = chromStr.substring(3, chromStr.length());
}
startStr = startStr.replaceAll(",", "");
endStr = endStr.replaceAll(",", "");
int start = Math.min(stringToNum(startStr), stringToNum(endStr));
int end = Math.max(stringToNum(startStr), stringToNum(endStr));
output = new Region(genome, chromStr, start, end);
}
else if (pointmatcher.find()) {
if (pointmatcher.groupCount() != 2) {
return null;
}
String chromStr = pointmatcher.group(1);
String startStr = pointmatcher.group(2);
if (chromStr.startsWith("chr")) {
chromStr = chromStr.substring(3, chromStr.length());
}
startStr = startStr.replaceAll(",", "");
output = new Region(genome, chromStr, stringToNum(startStr), stringToNum(startStr) + 1);
}
else if (chrommatcher.matches()) {
String chromStr = chrommatcher.group(1);
if (chromStr.startsWith("chr")) {
chromStr = chromStr.substring(3, chromStr.length());
}
ChromosomeInfo info = genome.getChrom(chromStr);
if (info != null) {
int length = info.getLength();
output = new Region(genome, chromStr, 1, length );
}
}
if (output != null && strand != ' ') {
output = new StrandedRegion(output, strand);
}
return output;
}
/**
* Parses integers but understand k and m abbreviations for kilo and mega.
*/
public static int stringToNum(String s) {
if (s.matches(".*[kK]$")) {
return 1000 * Integer.parseInt(s.substring(0, s.length() - 1));
}
if (s.matches(".*[mM]$")) {
return 1000000 * Integer.parseInt(s.substring(0, s.length() - 1));
}
return Integer.parseInt(s);
}
public static boolean[] overlap(List<Region> sourceRegions, List<Region> targetRegions) {
return overlap(sourceRegions.toArray(new Region[0]), targetRegions.toArray(new Region[0]));
}//end of overlap method
/**
* This method finds all possible overlaps between <tt>N source</tt> and <tt>M target</tt>
* regions. <br>
* It runs in <tt>O(N log(M)) + O(M log(M))</tt> time compared to the naive (nested loop) search
* between all the elements of <tt>source</tt> and <tt>target</tt> regions which runs in O(N M) time.
* @param sourceRegions <tt>N</tt> source regions where we want to find overlaps between these and the <tt>target</tt> regions
* @param targetRegions <tt>M</tt> target regions
* @return <tt>true</tt> in all positions where there is an overlap between a <tt>source</tt> and
* a <tt>target</tt> region. <tt>false</tt>, otherwise.
*/
public static boolean[] overlap(Region[] sourceRegions, Region[] targetRegions) {
boolean[] overlaps = new boolean[sourceRegions.length];
if(sourceRegions.length == 0 || targetRegions.length == 0) { return overlaps; }
Arrays.sort(targetRegions);
int[] target_start = new int[targetRegions.length];
for(int i = 0; i < target_start.length; i++) { target_start[i] = targetRegions[i].getStart(); }
int[] target_end = new int[targetRegions.length];
for(int i = 0; i < target_end.length; i++) { target_end[i] = targetRegions[i].getEnd(); }
int[] idx = StatUtil.findSort(target_end);
for(Region sourceReg:sourceRegions) {
boolean sourceRegOverlaps = false;
int start_idx = Arrays.binarySearch(target_end, sourceReg.getStart());
int end_idx = Arrays.binarySearch(target_start, sourceReg.getEnd());
if( start_idx < 0 ) { start_idx = Math.min(-start_idx - 1, target_start.length-1); }
if( end_idx < 0 ) { end_idx = Math.min(-end_idx - 1 , target_start.length-1); }
for(int i = idx[start_idx]; i <= end_idx; i++) {
if(targetRegions[i].overlaps(sourceReg)) {
sourceRegOverlaps = true;
break;
}
}
if(!sourceRegOverlaps && (end_idx - idx[start_idx] + 1 != target_start.length)) {
for(int i = Math.min(idx[start_idx]+1, target_start.length-1); i > 0; i--) {
if(targetRegions[i].overlaps(targetRegions[i-1])) {
if(targetRegions[i-1].overlaps(sourceReg)){
sourceRegOverlaps = true;
break;
}
}
else
break;
}
}
}
return overlaps;
}//end of overlap method
public boolean equals(Object o) {
if (o instanceof Region) {
Region r = (Region) o;
return matchesRegion(r);
}
else {
return false;
}
}
public int hashCode() {
int code = 17;
code += g.hashCode();
code *= 37;
code += chrom.hashCode();
code *= 37;
code += start;
code *= 37;
code += end;
code *= 37;
return code;
}
/*
* (non-Javadoc)
*
* @see java.lang.Comparable#compareTo(java.lang.Object)
*/
public int compareTo(Region r) {
if (!chrom.equals(r.chrom)) {
return chrom.compareTo(r.chrom);
}
if (start < r.start) {
return -1;
}
if (start > r.start) {
return 1;
}
if (end < r.end) {
return -1;
}
if (end > r.end) {
return 1;
}
return 0;
}
/**
* Given a list of regions, filter out all the regions that overlaps with any reference regions, return the non-overlap regions
* @param rs
* @param refs
* @return
*/
public static List<Region> filterOverlapRegions(List<Region> rs, List<Region> refs){
List<Region> results = new ArrayList<Region>();
Map<String, List<Region>> chr2rs = new HashMap<String, List<Region>>();
// group regions by chrom
for(Region r:rs) {
String chrom = r.getChrom();
if(!chr2rs.containsKey(chrom))
chr2rs.put(chrom, new ArrayList<Region>());
chr2rs.get(chrom).add(r);
}
Map<String, List<Region>> chr2refs = new HashMap<String, List<Region>>();
for(Region r:refs) {
String chrom = r.getChrom();
if(!chr2rs.containsKey(chrom))
continue;
if(!chr2refs.containsKey(chrom))
chr2refs.put(chrom, new ArrayList<Region>());
chr2refs.get(chrom).add(r);
}
// for each chrom
for (String chr : chr2rs.keySet()){
List<Region> rsc = chr2rs.get(chr);
if (!chr2refs.containsKey(chr)){
results.addAll(rsc);
continue;
}
List<Region> refsc = chr2refs.get(chr);
Collections.sort(rsc);
Collections.sort(refsc);
int rId=0;
int refId=0;
while(rId<rsc.size()){
Region r = rsc.get(rId);
Region ref = refsc.get(refId);
if (r.overlaps(ref)){
rId++;
continue;
}
else{
if(r.before(ref)){
results.add(r);
rId++;
continue;
}
else if(r.after(ref)){
refId++;
if (refId==refsc.size()){ // end of ref regions
for (int i=rId+1;i<rsc.size();i++)
results.add(rsc.get(i));
break;
}
}
}
}
}
return results;
}
/**
* Merge the overlapped regions
* @param regions
* @return
*/
public static List<Region> mergeRegions(List<Region> regions){
if (regions.isEmpty())
return regions;
List<Region> mergedRegions = new ArrayList<Region>();
Collections.sort(regions);
Region previous = regions.get(0);
for (int i = 1; i < regions.size(); i++) {
Region region = regions.get(i);
// if overlaps with previous region, combine the regions
if (previous.overlaps(region)){
previous = previous.combine(region);
} else{
mergedRegions.add(previous);
previous = region;
}
}
mergedRegions.add(previous);
return mergedRegions;
}//end of mergeRegions method
}