OMICS workshop I Exercise - Solutions
R
tidyverse
omics
workshop
Solutions: Integrative Tidy Omics data Wrangling
Setup
library(tidyverse)
set.seed(12345) # for reproducibilityData generation
# 1) Sample metadata
sample_meta <- tibble(
sample_id = str_c("S", str_pad(1:30, width = 3, pad = "0")),
tissue = sample(c("tumor", "normal"), 30, replace = TRUE, prob = c(0.55, 0.45)),
cohort = sample(c("A", "B", "C"), 30, replace = TRUE),
batch = sample(c("B1", "B2", "B3"), 30, replace = TRUE)
)
# 2) Clinical table (with intentional unmatched rows for join debugging)
clinical <- tibble(
sample_id = c(sample_meta$sample_id[1:26], "S900", "S901"),
patient_age = sample(35:85, 28, replace = TRUE),
response = sample(c("CR", "PR", "SD", "PD"), 28, replace = TRUE)
)
# 3) Wide expression/count matrix (genes x samples)
genes <- str_c("GENE_", str_pad(1:120, 3, pad = "0"))
count_mat <- matrix(
rnbinom(length(genes) * nrow(sample_meta), mu = 120, size = 1.3),
nrow = length(genes),
ncol = nrow(sample_meta),
dimnames = list(genes, sample_meta$sample_id)
)
counts_wide <- as_tibble(count_mat, rownames = "gene_id")
sample_meta
clinical
counts_widePart 1 — Join diagnostics and clean analysis table
# IDs in metadata that do not exist in clinical
missing_in_clinical <- sample_meta %>%
anti_join(clinical, by = "sample_id")
# IDs in clinical that do not exist in metadata
extra_in_clinical <- clinical %>%
anti_join(sample_meta, by = "sample_id")
# Keep only clinical rows that can match metadata
clinical_clean <- clinical %>%
semi_join(sample_meta, by = "sample_id")
# Annotated sample table
sample_annot <- sample_meta %>%
left_join(clinical_clean, by = "sample_id") %>%
mutate(
age_group = case_when(
is.na(patient_age) ~ NA_character_,
patient_age < 50 ~ "young",
patient_age < 70 ~ "middle",
TRUE ~ "older"
)
)
missing_in_clinical
extra_in_clinical
sample_annotPart 2 — Reshape wide counts and annotate samples
counts_long <- counts_wide %>%
pivot_longer(
cols = -gene_id,
names_to = "sample_id",
values_to = "count"
)
counts_annot <- counts_long %>%
left_join(sample_annot, by = "sample_id") %>%
mutate(log_count = log2(count + 1))
counts_long
counts_annotPart 3 — Multi-level summaries
gene_tissue_summary <- counts_annot %>%
group_by(gene_id, tissue) %>%
summarise(
mean_count = mean(count, na.rm = TRUE),
sd_count = sd(count, na.rm = TRUE),
mean_log_count = mean(log_count, na.rm = TRUE),
n_samples = n(),
.groups = "drop"
)
cohort_summary <- counts_annot %>%
group_by(cohort, tissue) %>%
summarise(
n_rows = n(),
across(
where(is.numeric),
list(
mean = ~ mean(.x, na.rm = TRUE),
median = ~ median(.x, na.rm = TRUE)
),
.names = "{.col}_{.fn}"
),
.groups = "drop"
)
gene_tissue_summary
cohort_summaryPart 4 — Wide result table and differential signal
top_genes_delta <- gene_tissue_summary %>%
select(gene_id, tissue, mean_count) %>%
pivot_wider(
names_from = tissue,
values_from = mean_count,
names_prefix = "mean_count_"
) %>%
mutate(
delta = mean_count_tumor - mean_count_normal,
abs_delta = abs(delta)
) %>%
arrange(desc(abs_delta)) %>%
slice_head(n = 15)
top_genes_deltaDeliverables
sample_annot
counts_annot
gene_tissue_summary
cohort_summary
top_genes_deltaStretch challenge (optional)
# Cohort-wise gene delta
gene_tissue_cohort <- counts_annot %>%
group_by(cohort, gene_id, tissue) %>%
summarise(mean_count = mean(count, na.rm = TRUE), .groups = "drop")
cohort_gene_delta <- gene_tissue_cohort %>%
pivot_wider(
names_from = tissue,
values_from = mean_count,
names_prefix = "mean_count_"
) %>%
mutate(
delta = mean_count_tumor - mean_count_normal,
abs_delta = abs(delta)
)
# Top 3 genes by cohort
top3_by_cohort <- cohort_gene_delta %>%
group_by(cohort) %>%
slice_max(order_by = abs_delta, n = 3, with_ties = FALSE) %>%
summarise(top3_genes = str_c(gene_id, collapse = ", "), .groups = "drop")
# Cohort-level clinical summaries
cohort_sample_stats <- sample_annot %>%
group_by(cohort) %>%
summarise(
mean_age = mean(patient_age, na.rm = TRUE),
cr_fraction = mean(response == "CR", na.rm = TRUE),
.groups = "drop"
)
stretch_report <- top3_by_cohort %>%
left_join(cohort_sample_stats, by = "cohort")
stretch_report