commit 395c18a9e62d5d03fdf7042cc72582daf331b8e6
Author: Hamza Ansari
Date:   Sun Jun 21 06:39:53 2026 -0400

    Initial RNA-seq DESeq2 pipeline manifests

diff --git a/01-pvc.yaml b/01-pvc.yaml
new file mode 100644
index 0000000..85facd7
--- /dev/null
+++ b/01-pvc.yaml
@@ -0,0 +1,14 @@
+# 01-pvc.yaml
+# Shared workspace volume: every pipeline Job mounts this claim at /data.
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: rnaseq-workspace
+  namespace: rnaseq
+spec:
+  accessModes:
+    - ReadWriteOnce
+  storageClassName: longhorn
+  resources:
+    requests:
+      storage: 30Gi
diff --git a/02-job-sra-download.yaml b/02-job-sra-download.yaml
new file mode 100644
index 0000000..d4f7a81
--- /dev/null
+++ b/02-job-sra-download.yaml
@@ -0,0 +1,61 @@
+# 02-job-sra-download.yaml
+# Downloads ERR458493 (WT) and ERR458500 (snf2) into /data/raw as gzipped FASTQ.
+# The download runs as an init container; the main container only lists the files.
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: sra-download
+  namespace: rnaseq
+spec:
+  backoffLimit: 2
+  template:
+    spec:
+      restartPolicy: Never
+
+      initContainers:
+        - name: download-sra
+          image: ncbi/sra-tools:3.0.0
+          command: ["/bin/sh", "-c"]
+          args:
+            - |
+              set -e
+              mkdir -p /data/raw
+
+              echo "=== Downloading ERR458493 (WT, single-end) ==="
+              fasterq-dump ERR458493 --outdir /data/raw --temp /tmp --threads 4
+
+              echo "=== Downloading ERR458500 (snf2 mutant, single-end) ==="
+              fasterq-dump ERR458500 --outdir /data/raw --temp /tmp --threads 4
+
+              echo "=== Compressing ==="
+              gzip /data/raw/*.fastq
+
+              echo "=== Done ==="
+              ls -lh /data/raw/
+          resources:
+            requests:
+              memory: "1Gi"
+              cpu: "2"
+            limits:
+              memory: "2Gi"
+              cpu: "4"
+          volumeMounts:
+            - name: workspace
+              mountPath: /data
+
+      containers:
+        - name: verify
+          image: busybox:1.36
+          command: ["/bin/sh", "-c"]
+          args:
+            - |
+              echo "=== Raw files ==="
+              ls -lh /data/raw/
+          volumeMounts:
+            - name: workspace
+              mountPath: /data
+
+      volumes:
+        - name: workspace
+          persistentVolumeClaim:
+            claimName: rnaseq-workspace
diff --git a/02b-job-sra-download-extra.yaml b/02b-job-sra-download-extra.yaml
new file mode 100644
index 0000000..e9bbc12
--- /dev/null
+++ b/02b-job-sra-download-extra.yaml
@@ -0,0 +1,45 @@
+# 02b-job-sra-download-extra.yaml
+# Downloads the four remaining runs (ERR458494/495, ERR458501/502) into /data/raw.
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: sra-download-extra
+  namespace: rnaseq
+spec:
+  backoffLimit: 2
+  template:
+    spec:
+      restartPolicy: Never
+      initContainers:
+        - name: download-sra
+          image: ncbi/sra-tools:3.0.0
+          command: ["/bin/sh", "-c"]
+          args:
+            - |
+              set -e
+              # Same guard as 02-job-sra-download.yaml, so this Job can also run on its own.
+              mkdir -p /data/raw
+              for acc in ERR458494 ERR458495 ERR458501 ERR458502; do
+                echo "=== Downloading $acc ==="
+                fasterq-dump $acc --outdir /data/raw --temp /tmp --threads 4
+              done
+              gzip /data/raw/*.fastq
+              ls -lh /data/raw/
+          resources:
+            requests: { memory: "1Gi", cpu: "2" }
+            limits: { memory: "2Gi", cpu: "4" }
+          volumeMounts:
+            - name: workspace
+              mountPath: /data
+      containers:
+        - name: verify
+          image: busybox:1.36
+          command: ["/bin/sh", "-c"]
+          args: ["ls -lh /data/raw/"]
+          volumeMounts:
+            - name: workspace
+              mountPath: /data
+      volumes:
+        - name: workspace
+          persistentVolumeClaim:
+            claimName: rnaseq-workspace
diff --git a/03-job-fastqc.yaml b/03-job-fastqc.yaml
new file mode 100644
index 0000000..181629a
--- /dev/null
+++ b/03-job-fastqc.yaml
@@ -0,0 +1,47 @@
+# 03-job-fastqc.yaml
+# Runs FastQC on every gzipped FASTQ in /data/raw and writes reports to /data/fastqc.
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: fastqc
+  namespace: rnaseq
+spec:
+  backoffLimit: 2
+  template:
+    spec:
+      restartPolicy: Never
+
+      containers:
+        - name: fastqc
+          image: biocontainers/fastqc:v0.11.9_cv8
+          command: ["/bin/sh", "-c"]
+          args:
+            - |
+              set -e
+              # FastQC does not create --outdir itself; it must exist before the run.
+              mkdir -p /data/fastqc
+
+              echo "=== Running FastQC on all samples ==="
+              # Glob rather than a fixed list, so the runs added by 02b are covered too.
+              fastqc \
+                /data/raw/*.fastq.gz \
+                --outdir /data/fastqc \
+                --threads 4
+
+              echo "=== Done ==="
+              ls -lh /data/fastqc/
+          resources:
+            requests:
+              memory: "1Gi"
+              cpu: "2"
+            limits:
+              memory: "2Gi"
+              cpu: "4"
+          volumeMounts:
+            - name: workspace
+              mountPath: /data
+
+      volumes:
+        - name: workspace
+          persistentVolumeClaim:
+            claimName: rnaseq-workspace
diff --git a/04-job-star.yaml b/04-job-star.yaml
new file mode 100644
index 0000000..02f0718
--- /dev/null
+++ b/04-job-star.yaml
@@ -0,0 +1,64 @@
+# 04-job-star.yaml
+# STAR alignment of ERR458493 and ERR458500 to sorted BAMs under /data/aligned/<run>/.
+# Expects a prebuilt index in /data/reference/star_index; no manifest here builds it.
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: star-align
+  namespace: rnaseq
+spec:
+  backoffLimit: 1
+  template:
+    spec:
+      restartPolicy: Never
+
+      containers:
+        - name: star
+          image: quay.io/biocontainers/star:2.7.10a--h9ee0642_0
+          command: ["/bin/sh", "-c"]
+          args:
+            - |
+              set -e
+
+              echo "=== Aligning ERR458493 (WT) ==="
+              mkdir -p /data/aligned/ERR458493
+              STAR \
+                --runMode alignReads \
+                --genomeDir /data/reference/star_index \
+                --readFilesIn /data/raw/ERR458493.fastq.gz \
+                --readFilesCommand zcat \
+                --outSAMtype BAM SortedByCoordinate \
+                --outSAMattributes NH HI AS NM \
+                --outFileNamePrefix /data/aligned/ERR458493/ \
+                --runThreadN 4
+
+              echo "=== Aligning ERR458500 (snf2 mutant) ==="
+              mkdir -p /data/aligned/ERR458500
+              STAR \
+                --runMode alignReads \
+                --genomeDir /data/reference/star_index \
+                --readFilesIn /data/raw/ERR458500.fastq.gz \
+                --readFilesCommand zcat \
+                --outSAMtype BAM SortedByCoordinate \
+                --outSAMattributes NH HI AS NM \
+                --outFileNamePrefix /data/aligned/ERR458500/ \
+                --runThreadN 4
+
+              echo "=== Done ==="
+              ls -lh /data/aligned/ERR458493/
+              ls -lh /data/aligned/ERR458500/
+          resources:
+            requests:
+              memory: "6Gi"
+              cpu: "4"
+            limits:
+              memory: "10Gi"
+              cpu: "4"
+          volumeMounts:
+            - name: workspace
+              mountPath: /data
+
+      volumes:
+        - name: workspace
+          persistentVolumeClaim:
+            claimName: rnaseq-workspace
diff --git a/04b-job-star-extra.yaml b/04b-job-star-extra.yaml
new file mode 100644
index 0000000..65036f2
--- /dev/null
+++ b/04b-job-star-extra.yaml
@@ -0,0 +1,53 @@
+# 04b-job-star-extra.yaml
+# Same STAR alignment as 04-job-star.yaml, looped over the four remaining runs.
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: star-align-extra
+  namespace: rnaseq
+spec:
+  backoffLimit: 1
+  template:
+    spec:
+      restartPolicy: Never
+
+      containers:
+        - name: star
+          image: quay.io/biocontainers/star:2.7.10a--h9ee0642_0
+          command: ["/bin/sh", "-c"]
+          args:
+            - |
+              set -e
+              for acc in ERR458494 ERR458495 ERR458501 ERR458502; do
+                echo "=== Aligning $acc ==="
+                mkdir -p /data/aligned/$acc
+                STAR \
+                  --runMode alignReads \
+                  --genomeDir /data/reference/star_index \
+                  --readFilesIn /data/raw/$acc.fastq.gz \
+                  --readFilesCommand zcat \
+                  --outSAMtype BAM SortedByCoordinate \
+                  --outSAMattributes NH HI AS NM \
+                  --outFileNamePrefix /data/aligned/$acc/ \
+                  --runThreadN 4
+              done
+
+              echo "=== Done ==="
+              for acc in ERR458494 ERR458495 ERR458501 ERR458502; do
+                ls -lh /data/aligned/$acc/
+              done
+          resources:
+            requests:
+              memory: "6Gi"
+              cpu: "4"
+            limits:
+              memory: "10Gi"
+              cpu: "4"
+          volumeMounts:
+            - name: workspace
+              mountPath: /data
+
+      volumes:
+        - name: workspace
+          persistentVolumeClaim:
+            claimName: rnaseq-workspace
diff --git a/05-job-featurecounts.yaml b/05-job-featurecounts.yaml
new file mode 100644
index 0000000..23edfcc
--- /dev/null
+++ b/05-job-featurecounts.yaml
@@ -0,0 +1,53 @@
+# 05-job-featurecounts.yaml
+# Counts reads per gene across the six sorted BAMs with featureCounts.
+# Expects /data/reference/annotation.gtf; no manifest here provides it.
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: featurecounts-v2
+  namespace: rnaseq
+spec:
+  backoffLimit: 2
+  template:
+    spec:
+      restartPolicy: Never
+
+      containers:
+        - name: featurecounts
+          image: quay.io/biocontainers/subread:2.0.6--he4a0461_0
+          command: ["/bin/sh", "-c"]
+          args:
+            - |
+              set -e
+              # Make sure the output directory exists before featureCounts opens -o.
+              mkdir -p /data/counts
+
+              echo "=== Running featureCounts on 6 samples ==="
+              featureCounts \
+                -a /data/reference/annotation.gtf \
+                -o /data/counts/gene_counts.txt \
+                -T 4 \
+                /data/aligned/ERR458493/Aligned.sortedByCoord.out.bam \
+                /data/aligned/ERR458494/Aligned.sortedByCoord.out.bam \
+                /data/aligned/ERR458495/Aligned.sortedByCoord.out.bam \
+                /data/aligned/ERR458500/Aligned.sortedByCoord.out.bam \
+                /data/aligned/ERR458501/Aligned.sortedByCoord.out.bam \
+                /data/aligned/ERR458502/Aligned.sortedByCoord.out.bam
+
+              echo "=== Done ==="
+              cat /data/counts/gene_counts.txt.summary
+          resources:
+            requests:
+              memory: "1Gi"
+              cpu: "2"
+            limits:
+              memory: "2Gi"
+              cpu: "4"
+          volumeMounts:
+            - name: workspace
+              mountPath: /data
+
+      volumes:
+        - name: workspace
+          persistentVolumeClaim:
+            claimName: rnaseq-workspace
diff --git a/06-job-deseq2.yaml b/06-job-deseq2.yaml
new file mode 100644
index 0000000..f63d949
--- /dev/null
+++ b/06-job-deseq2.yaml
@@ -0,0 +1,76 @@
+# 06-job-deseq2.yaml
+# Writes an R script to /tmp and runs DESeq2 (WT vs. SNF2) on the featureCounts matrix.
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: deseq2-v2
+  namespace: rnaseq
+spec:
+  backoffLimit: 1
+  template:
+    spec:
+      restartPolicy: Never
+
+      containers:
+        - name: deseq2
+          image: quay.io/biocontainers/bioconductor-deseq2:1.46.0--r44he5774e6_1
+          command: ["/bin/sh", "-c"]
+          args:
+            - |
+              set -e
+              # write.csv in the R script does not create missing directories.
+              mkdir -p /data/results
+
+              cat <<'EOF' > /tmp/deseq2_analysis.R
+              library(DESeq2)
+
+              counts_raw <- read.table("/data/counts/gene_counts.txt",
+                                       header = TRUE, skip = 1, row.names = 1)
+
+              # With Geneid as row names, columns 1-5 are presumably Chr/Start/End/Strand/
+              # Length (standard featureCounts layout - confirm); 6:11 are the six count
+              # columns, in the BAM order given in 05-job-featurecounts.yaml.
+              counts <- counts_raw[, 6:11]
+              colnames(counts) <- c("WT_1", "WT_2", "WT_3",
+                                    "SNF2_1", "SNF2_2", "SNF2_3")
+
+              coldata <- data.frame(
+                condition = factor(c("WT", "WT", "WT",
+                                     "SNF2", "SNF2", "SNF2"),
+                                   levels = c("WT", "SNF2"))
+              )
+              rownames(coldata) <- colnames(counts)
+
+              dds <- DESeqDataSetFromMatrix(countData = counts,
+                                            colData = coldata,
+                                            design = ~ condition)
+
+              dds <- DESeq(dds)
+              res <- results(dds)
+              res <- res[order(res$padj), ]
+
+              write.csv(as.data.frame(res), "/data/results/deseq2_results.csv")
+
+              cat("=== Summary ===\n")
+              summary(res)
+
+              cat("\n=== Top 10 genes by adjusted p-value ===\n")
+              print(head(as.data.frame(res), 10))
+              EOF
+
+              Rscript /tmp/deseq2_analysis.R
+          resources:
+            requests:
+              memory: "2Gi"
+              cpu: "2"
+            limits:
+              memory: "4Gi"
+              cpu: "4"
+          volumeMounts:
+            - name: workspace
+              mountPath: /data
+
+      volumes:
+        - name: workspace
+          persistentVolumeClaim:
+            claimName: rnaseq-workspace
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..7b5d8c5
--- /dev/null
+++ b/README.md
@@ -0,0 +1,63 @@
+# RNA-seq DESeq2 Pipeline
+
+A sequential Kubernetes Job pipeline for differential expression analysis on yeast RNA-seq data. Each stage runs as a one-shot Job against a shared PVC, in order.
+
+## Dataset
+
+- Source: Gierliński et al., ENA accession PRJEB5348
+- Reads: 50 bp single-end
+- Conditions: wild-type (WT: ERR458493–495) vs. snf2 deletion mutant (snf2: ERR458500–502)
+
+## Prerequisites
+
+- The `rnaseq` namespace must exist before any manifest is applied.
+- The STAR index (`/data/reference/star_index`) and the annotation (`/data/reference/annotation.gtf`) must be on the shared PVC before stage 4 (STAR) and stage 5 (featureCounts) run; no manifest in this repository creates them.
+
+## Pipeline stages
+
+| Order | File | Stage |
+|---|---|---|
+| 1 | `01-pvc.yaml` | Shared PersistentVolumeClaim for pipeline data and intermediate files |
+| 2 | `02-job-sra-download.yaml` | Downloads raw FASTQ reads from SRA/ENA |
+| 2b | `02b-job-sra-download-extra.yaml` | Downloads the remaining replicate samples |
+| 3 | `03-job-fastqc.yaml` | FastQC read quality control |
+| 4 | `04-job-star.yaml` | STAR alignment to the reference genome |
+| 4b | `04b-job-star-extra.yaml` | STAR alignment for the remaining replicate samples |
+| 5 | `05-job-featurecounts.yaml` | Gene-level count matrix from aligned reads |
+| 6 | `06-job-deseq2.yaml` | DESeq2 differential expression analysis (WT vs. snf2) |
+
+## Results
+
+- STAR alignment: ~85–90% mapping rate across samples
+- DESeq2 output visualized (volcano plot, etc.) in a Jupyter R notebook
+
+## Running
+
+Namespace: `rnaseq`. Jobs are sequential — each depends on the previous stage's output landing on the shared PVC, so apply and wait for completion before moving to the next:
+
+```bash
+kubectl apply -f 01-pvc.yaml
+
+# Apply one Job at a time; kubectl wait blocks until it reports Complete.
+# (Progress can be watched separately with: kubectl get jobs -n rnaseq -w)
+kubectl apply -f 02-job-sra-download.yaml
+kubectl wait -n rnaseq --for=condition=complete --timeout=6h job/sra-download
+
+kubectl apply -f 02b-job-sra-download-extra.yaml
+kubectl wait -n rnaseq --for=condition=complete --timeout=6h job/sra-download-extra
+
+kubectl apply -f 03-job-fastqc.yaml
+kubectl wait -n rnaseq --for=condition=complete --timeout=6h job/fastqc
+
+kubectl apply -f 04-job-star.yaml
+kubectl wait -n rnaseq --for=condition=complete --timeout=6h job/star-align
+
+kubectl apply -f 04b-job-star-extra.yaml
+kubectl wait -n rnaseq --for=condition=complete --timeout=6h job/star-align-extra
+
+kubectl apply -f 05-job-featurecounts.yaml
+kubectl wait -n rnaseq --for=condition=complete --timeout=6h job/featurecounts-v2
+
+kubectl apply -f 06-job-deseq2.yaml
+kubectl wait -n rnaseq --for=condition=complete --timeout=6h job/deseq2-v2
+```