14 Functional Analysis

14.1 Google Slides

14.2 Gene sets and signatures

14.2.1 Cell Cycle

marrow <- CellCycleScoring(object = marrow, s.genes = s.genes, g2m.genes = g2m.genes, 
    set.ident = TRUE)

# view cell cycle scores and phase assignments
head(x = marrow@meta.data)
# Visualize the distribution of cell cycle markers across
RidgePlot(object = marrow, features.plot = c("PCNA", "TOP2A", "MCM6", "MKI67"), nCol = 2)
# Running a PCA on cell cycle genes reveals, unsurprisingly, that cells
# separate entirely by phase
marrow <- RunPCA(object = marrow, pc.genes = c(s.genes, g2m.genes), do.print = FALSE)
PCAPlot(object = marrow)

14.3 Pathway analysis

14.4 inferCNV / honeybadger

Github Page

14.4.1 Create the InferCNV Object

Reading in the raw counts matrix and meta data, populating the infercnv object

infercnv_obj = CreateInfercnvObject(
  raw_counts_matrix="../example/oligodendroglioma_expression_downsampled.counts.matrix",
  annotations_file="../example/oligodendroglioma_annotations_downsampled.txt",
  delim="\t",
  gene_order_file="../example/gencode_downsampled.txt",
  ref_group_names=c("Microglia/Macrophage","Oligodendrocytes (non-malignant)"))

14.4.2 Filtering genes

Removing those genes that are very lowly expressed or present in very few cells

# filter out low expressed genes
cutoff=1
infercnv_obj <- require_above_min_mean_expr_cutoff(infercnv_obj, cutoff)
# filter out bad cells
min_cells_per_gene=3
infercnv_obj <- require_above_min_cells_ref(infercnv_obj, min_cells_per_gene=min_cells_per_gene)
## for safe keeping
infercnv_orig_filtered = infercnv_obj
#plot_mean_chr_expr_lineplot(infercnv_obj)
save('infercnv_obj', file = '../example_output/infercnv_obj.orig_filtered')

14.4.3 Normalize each cell’s counts for sequencing depth

infercnv_obj <- infercnv:::normalize_counts_by_seq_depth(infercnv_obj)

14.4.4 Perform Anscombe normalization

Suggested by Matan for removing noisy variation at low counts

infercnv_obj <- infercnv:::anscombe_transform(infercnv_obj)

14.4.5 Log transform the normalized counts:

infercnv_obj <- log2xplus1(infercnv_obj)

14.4.6 Apply maximum bounds to the expression data to reduce outlier effects

threshold = mean(abs(get_average_bounds(infercnv_obj)))
infercnv_obj <- apply_max_threshold_bounds(infercnv_obj, threshold=threshold)

14.4.7 Initial view, before inferCNV operations:

plot_cnv(infercnv_obj,
         out_dir='../example_output/', 
         output_filename='infercnv.logtransf', 
         x.range="auto", 
         title = "Before InferCNV (filtered & log2 transformed)", 
         color_safe_pal = FALSE, 
         x.center = mean(infercnv_obj@expr.data))

14.4.8 Perform smoothing across chromosomes

infercnv_obj = smooth_by_chromosome(infercnv_obj, window_length=101, smooth_ends=TRUE)
# re-center each cell
infercnv_obj <- center_cell_expr_across_chromosome(infercnv_obj, method = "median")
plot_cnv(infercnv_obj, 
         out_dir='../example_output/',
         output_filename='infercnv.chr_smoothed', 
         x.range="auto", 
         title = "chr smoothed and cells re-centered", 
         color_safe_pal = FALSE)

14.4.9 Subtract the reference values from observations, now have log(fold change) values

infercnv_obj <- subtract_ref_expr_from_obs(infercnv_obj, inv_log=TRUE)
plot_cnv(infercnv_obj,
         out_dir='../example_output/',
         output_filename='infercnv.ref_subtracted', 
         x.range="auto", 
         title="ref subtracted", 
         color_safe_pal = FALSE)

14.4.10 Invert log values

Converting the log(FC) values to regular fold change values, centered at 1 (no fold change)

This is important because we want (1/2)x to be symmetrical to 1.5x, representing loss/gain of one chromosome region.

infercnv_obj <- invert_log2(infercnv_obj)
plot_cnv(infercnv_obj, 
         out_dir='../example_output/',
         output_filename='infercnv.inverted', 
         color_safe_pal = FALSE, 
         x.range="auto", 
         x.center=1, 
         title = "inverted log FC to FC")

14.4.11 Removing noise

infercnv_obj <- clear_noise_via_ref_mean_sd(infercnv_obj, sd_amplifier = 1.5)
plot_cnv(infercnv_obj,
         out_dir='../example_output/',
         output_filename='infercnv.denoised', 
         x.range="auto", 
         x.center=1, 
         title="denoised", 
         color_safe_pal = FALSE)

14.4.12 Remove outlier data points

This generally improves on the visualization

infercnv_obj = remove_outliers_norm(infercnv_obj)
    plot_cnv(infercnv_obj, 
         out_dir='../example_output/',
         output_filename='infercnv.outliers_removed', 
         color_safe_pal = FALSE, 
         x.range="auto", 
         x.center=1, 
         title = "outliers removed")

14.4.13 Find DE genes by comparing the mutant types to normal types, BASIC

Runs a t-Test comparing tumor/normal for each patient and normal sample, and masks out those genes that are not significantly DE.

plot_data = infercnv_obj@expr.data
high_threshold = max(abs(quantile(plot_data[plot_data != 0], c(0.05, 0.95))))  
low_threshold = -1 * high_threshold 
infercnv_obj2 <- infercnv:::mask_non_DE_genes_basic(infercnv_obj, test.use = 't', center_val=1)
plot_cnv(infercnv_obj2, 
         out_dir='../example_output/',
         output_filename='infercnv.non-DE-genes-masked', 
         color_safe_pal = FALSE, 
         x.range=c(low_threshold, high_threshold), 
         x.center=1, 
         title = "non-DE-genes-masked")

14.4.14 Additional Information

14.4.14.1 Online Documentation

For additional explanations on files, usage, and a tutorial please visit the wiki.

14.4.14.2 TrinityCTAT

This tool is a part of the TrinityCTAT toolkit focused on leveraging the use of RNA-Seq to better understand cancer transcriptomes. To find out more please visit TrinityCTAT