Making Hilbert Curve ======================================== **Author**: Zuguang Gu ( z.gu@dkfz.de ) **Date**: `r Sys.Date()` ------------------------------------------------------------- ```{r, echo = FALSE, message = FALSE} library(markdown) options(markdown.HTML.options = c(options('markdown.HTML.options')[[1]], "toc")) library(knitr) knitr::opts_chunk$set( error = FALSE, tidy = FALSE, message = FALSE, fig.align = "center", fig.width = 6, fig.height = 6) options(markdown.HTML.stylesheet = "custom.css") options(width = 100) ``` [Hilbert curve](https://en.wikipedia.org/wiki/Hilbert_curve) is a type of space-filling curves that fold one dimensional axis into a two dimensional space, but with still keeping the locality. It has advantages to visualize data with long axis in following two aspects: 1. greatly improve resolution for the visualization; 2. easy to visualize clusters because generally data points in the cluster will also be close in the Hilbert curve. This package aims to provide an easy and flexible way to visualize data through Hilbert curve. The implementation and example figures are based on following sources: - http://mkweb.bcgsc.ca/hilbert/ - http://corte.si/posts/code/hilbert/portrait/index.html - http://bioconductor.org/packages/devel/bioc/html/HilbertVis.html ```{r} suppressPackageStartupMessages(library(HilbertCurve)) library(circlize) set.seed(12345) ``` Following shows Hilbert curve with level 2, 3, 4, 5: ```{r, fig.width = 12, fig.height = 3, echo = FALSE} grid.newpage() pushViewport(viewport(layout = grid.layout(nr = 1, nc = 4))) pushViewport(viewport(layout.pos.row = 1, layout.pos.col = 1)) HilbertCurve(1, 100, level = 2, reference = TRUE, newpage = FALSE, title = "level = 2") upViewport() pushViewport(viewport(layout.pos.row = 1, layout.pos.col = 2)) HilbertCurve(1, 100, level = 3, reference = TRUE, newpage = FALSE, title = "level = 3") upViewport() pushViewport(viewport(layout.pos.row = 1, layout.pos.col = 3)) HilbertCurve(1, 100, level = 4, reference = TRUE, newpage = FALSE, title = "level = 4") upViewport() pushViewport(viewport(layout.pos.row = 1, layout.pos.col = 4)) HilbertCurve(1, 100, level = 5, reference = TRUE, arrow = FALSE, newpage = FALSE, title = "level = 5") upViewport() upViewport() ``` Following heatmap shows distance between elements in the Hilbert curve. From left to right and from top to bottom, the order is the natural order of data points and colors correspond to the distance for pair distance in the Hilbert curve. Basically, if data points are close in the one dimensional axis, they also have small distance in the Hilbert curve (regions around diagonals) ```{r, fig.width = 7} library(HilbertVis) pos = HilbertVis::hilbertCurve(5) mat = as.matrix(dist(pos)) library(ComplexHeatmap) ht = Heatmap(mat, name = "dist", cluster_rows = FALSE, cluster_columns = FALSE, show_row_names = FALSE, show_column_names = FALSE, heatmap_legend_param = list(title = "euclidean_dist")) draw(ht, padding = unit(c(5, 5, 5, 2), "mm")) decorate_heatmap_body("dist", { grid.segments(c(0.25, 0.5, 0.75, 0, 0, 0), c(0, 0, 0, 0.25, 0.5, 0.75), c(0.25, 0.5, 0.75, 1, 1, 1), c(1, 1, 1, 0.25, 0.5, 0.75), gp = gpar(lty = 2)) grid.text(rev(c(256, 512, 768, 1024)), 0, c(0, 256, 512, 768)/1024, just = "bottom", rot = 90, gp = gpar(fontsize = 10)) grid.text(c(1, 256, 512, 768, 1024), c(1, 256, 512, 768, 1024)/1024, 1, just = "bottom", gp = gpar(fontsize = 10)) }) ``` ## Basic settings Generally, customizing a Hilbert curve follows following steps: ```{r, eval = FALSE} hc = HilbertCurve(...) hc_points(hc, ...) hc_rect(hc, ...) hc_segments(hc, ...) hc_text(hc, ...) ``` `HilbertCurve()` is a constructor function and initializes the Hilbert curve. Here the start and end position that correspond to the two ends of the Hilbert curve are specified as well as the level of the Hilbert curve. The function returns a `HilbertCurve` class instance and it can be used to add more graphics later. ```{r, eval = FALSE} hc = HilbertCurve(1, 100, level = 4, reference = TRUE) ``` The curve can be though as a folded axis. When the coordinate for this folded axis is initialized, low-level graphics can be added with specifying the positions. Before we demonstrate low-level functions, first we generate a random regions by **IRanges** package. ```{r} x = sort(sample(100, 20)) s = x[1:10*2 - 1] e = x[1:10*2] ir = IRanges(s, e) ir ``` ### Points There are two modes for adding points. By default, every segment in the Hilbert curve are segmented into several small segments and a circle is plotted at every small segments. ```{r} hc = HilbertCurve(1, 100, level = 4, reference = TRUE) hc_points(hc, ir) ``` The number of circles in the small segment can be controlled by `np` and graphical parameters can be set by `gp`. Note under this mode, the size of points can only be changed by `np` argument. This mode is useful if you have long regions. ```{r} hc = HilbertCurve(1, 100, level = 4, reference = TRUE) hc_points(hc, ir, np = 3, gp = gpar(fill = rand_color(length(ir)))) ``` Here such circles are not real points in R, they are polygons actually. There are some pre-defined shapes that you can choose from: "circle", "square", "triangle", "hexagon", "star". If `np` is set less than 2 or `NULL`, the points will be plotted at the center of every interval in `ir`. In this case, `size` argument is used to control the size of the points. This mode is useful if you have a lot of small regions. ```{r} hc = HilbertCurve(1, 100, level = 4, reference = TRUE) hc_points(hc, ir, np = NULL, size = unit(runif(length(ir)), "cm"), pch = 16) ``` ### Segments Adding segments is straightforward. ```{r} hc = HilbertCurve(1, 100, level = 4, reference = TRUE) hc_segments(hc, ir) ``` ### Rectangles Adding rectangles is straightforward. Note You cannot set the width or height of the rectangles. Rectangles are always located at the turning points and have width or height equal to the length of the segments. ```{r} hc = HilbertCurve(1, 100, level = 4, reference = TRUE) hc_rect(hc, ir) ``` As you can see some rectangles are not full red. It is because these segments in the curve do not full cover regions in `ir`, Averaging is applied in such segments. Actually it is also applicable for the 'segmented circle' mode for `hc_points`. Graphical values that correspond to `ir` are always represented as numeric values, such as size of points, width of lines or even colors which can be converted to numeric RGB values. If a segment in the Hilbert curve can not full overlap with some intervals in `ir`, these numeric parameters can be averaged. There are three different modes for averaging which are controlled by `mean_mode` argument. Following illustrates different settings for `mean_mode`: ``` 100 80 60 values in ir (e.g. red compoment for colors) ++++++ +++ +++++ ir ================ window (width = 16) 4 3 3 overlap absolute: (100 + 80 + 60)/3 weighted: (100*4 + 80*3 + 60*3)/(4 + 3 + 3) w0: (100*4 + 80*3 + 60*3)/16 ``` So use of the mode depends on specific scenario. For example, if `ir` corresponds to positions of genes, then the mode of `w0` is perhaps a good choise. If `ir` corresponds to positions of CpG sites which is has width of 1 and most of the time is sparse in genomic windows, then `absolute` is a correct choice. ### Text Adding texts is straightforward. Note text is added at the center of each interval in `ir`. ```{r} hc = HilbertCurve(1, 100, level = 4, reference = TRUE) labels = sample(letters, length(ir), replace = TRUE) hc_text(hc, ir, labels = labels) ``` ### Combine low-level functions With combination of these basic graphic functions, complicated graphics can be easily made: ```{r} hc = HilbertCurve(1, 100, level = 4) hc_segments(hc, IRanges(1, 100)) # background line hc_rect(hc, ir) hc_points(hc, ir, np = 3) hc_text(hc, ir, labels = labels, gp = gpar(fontsize = 16, col = "blue")) ``` ## Non-integer positions It doesn't matter if your positions are integers or not. Internally, rounding and zooming are applied to ensure the accuracy. Since the positions are not integers, you can specify the positions by `x1` and `x2`. All low-level graphical funtions accept `x1` and `x2`. ```{r} hc = HilbertCurve(0.1, 0.8, level = 4, reference = TRUE) hc_points(hc, x1 = c(0.15, 0.55), x2 = c(0.25, 0.65)) ``` ## Add title and legends Title are allowed by setting `title` argument. Legend can be passed to `legend` argument in `HilbertCurve()` as a `grob` object or a list of `grob` objects. `ColorMapping` class in **ComplexHeatmap** package or `legendGrob()` in **grid** package can be used to create a legend `grob`. Or you can consider to use `frameGrob()` and `packGrob()` to build a legend from ground. Because width and height are enforced to be equal for the Hilbert curve, sometimes you may see blank spaces around the curve. ```{r} value = runif(length(ir)) col_fun = colorRamp2(c(0, 1), c("white", "red")) cm = ColorMapping(col_fun = col_fun) legend = color_mapping_legend(cm, plot = FALSE, title = "value") hc = HilbertCurve(1, 100, reference = TRUE, title = "points", legend = legend) hc_points(hc, ir, np = 3, gp = gpar(fill = col_fun(value))) ``` ## Pixel mode When the level is high (e.g. > 10), the whole 2D space will be almost completely filled by the curve and it is impossible to add or visualize e.g. points on the curve. In this case, the 'pixel' mode visualizes each tiny 'segment' as a pixel and maps values to colors. So the Hilbert curve with level 11 will generate a PNG figure with 2048x2048 resolution. This is extremely useful for visualize genomic data. E.g. If we make a Hilbert curve for human chromosome 1 with level 11, then each pixel can represent 60bp (``249250621/2048/2048``) which is of very high resolution. Under 'pixel' mode, if the current device is an interactive deivce, every time a new layer is added, the image will be add to the interactive device as a rastered image. ```{r} hc = HilbertCurve(1, 100, level = 9, mode = "pixel") hc_layer(hc, ir) ``` Since you can only use color to map to other values, there is only one graphical setting `col`. You can add more than one layers, just remember to set transparent colors. And of course you can add title and legend to the plot. Grid lines can be added to the plot for better distinguish blocks in the Hilbert curve. ```{r} hc = HilbertCurve(1, 100, level = 9, mode = "pixel", title = "pixel mode", legend = legend) hc_layer(hc, ir, col = col_fun(value), grid_line = 2) hc_layer(hc, x1 = 75, x2 = 85, col = "#0000FF10") ``` The Hilbert curve can be save as PNG figure by `hc_png()`, with resolution `2^level x 2^level`. ```{r, eval = FALSE} hc_png(hc, file = "test.png") ``` ## Examples Visualize rainbow colors: ```{r} col = rainbow(100) hc = HilbertCurve(1, 100, level = 5) ir = IRanges(start = 1:99, end = 2:100) hc_points(hc, ir, np = 4, gp = gpar(col = col, fill = col)) ``` Genes on chromosome 1 (RefSeq genes for human, hg19). Here random colors are used to represent to different genes. ```{r} chr1_len = 249250621 load(paste0(system.file("extdata", package = "HilbertCurve"), "/refseq_chr1.RData")) hc = HilbertCurve(1, chr1_len, level = 5) hc_segments(hc, IRanges(start = 1, end = chr1_len), gp = gpar(col = "grey")) hc_segments(hc, ranges(g), gp = gpar(lwd = unit(6, "mm"), col = rand_color(length(g)))) ``` Following two figures compare sequence conservations. Data are from [here](http://genome.ucsc.edu/cgi-bin/hgTables?hgsid=434791817_tLOpTRgCwS2hR4uJ2cDjGaHUQEKj&clade=mammal&org=Human&db=hg19&hgta_group=compGeno&hgta_track=placentalChainNet&hgta_table=0&hgta_regionType=genome&position=chr16%3A81839239-81902938&hgta_outputType=wigData&hgta_outFileName=). ```{r} load(paste0(system.file("extdata", package = "HilbertCurve"), "/mouse_net.RData")) hc = HilbertCurve(1, chr1_len, level = 6) ir1 = ranges(mouse) ir2 = setdiff(IRanges(1, chr1_len), ir1) ir = c(ir1, ir2) col = c(rep("red", length(ir1)), rep("yellow", length(ir2))) hc_points(hc, ir, np = 3, gp = gpar(col = col, fill = col)) load(paste0(system.file("extdata", package = "HilbertCurve"), "/zebrafish_net.RData")) hc = HilbertCurve(1, chr1_len, level = 6) ir1 = ranges(zebrafish) ir2 = setdiff(IRanges(1, chr1_len), ir1) ir = c(ir1, ir2) col = c(rep("red", length(ir1)), rep("yellow", length(ir2))) hc_points(hc, ir, np = 3, gp = gpar(col = col, fill = col)) ``` GC percent on chromosome 1, under "normal" mode: ```{r, eval = FALSE} df = read.table(pipe("awk '$1==\"chr1\"' /path-to/hg19_gc_percent_window1000.bed")) col_fun = colorRamp2(c(0, 500, 1000), c("green", "#FFFFCC", "red")) png("gc_percent_chr1_points.png", width = 500, height = 500) hc = HilbertCurve(1, chr1_len, level = 6) hc_points(hc, IRanges(df[[2]], df[[3]]), np = 3, gp = gpar(fill = col_fun(df[[5]]), col = col_fun(df[[5]]))) hc_rect(hc, reduce(ranges(g)), gp = gpar(fill = "#00000020", col = "#00000020")) dev.off() ```