Initial RNA-seq DESeq2 pipeline manifests

This commit is contained in:
2026-06-21 06:39:53 -04:00
commit 395c18a9e6
9 changed files with 431 additions and 0 deletions
+13
View File
@@ -0,0 +1,13 @@
# 01-pvc.yaml
# Shared workspace volume for the RNA-seq pipeline. The Job manifests that
# accompany this file all mount this claim at /data.
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  namespace: rnaseq
  name: rnaseq-workspace
spec:
  storageClassName: longhorn
  # ReadWriteOnce: the volume is mounted read-write by one node at a time.
  accessModes: [ReadWriteOnce]
  resources:
    requests:
      storage: 30Gi
+59
View File
@@ -0,0 +1,59 @@
# 02-job-sra-download.yaml
# Downloads the first WT / snf2 sample pair from SRA and stores them as
# gzip-compressed FASTQ under /data/raw on the shared PVC.
---
apiVersion: batch/v1
kind: Job
metadata:
  name: sra-download
  namespace: rnaseq
spec:
  # Failed pods are retried, so the download script must be safe to re-run
  # against a partially populated /data/raw.
  backoffLimit: 2
  template:
    spec:
      restartPolicy: Never
      initContainers:
        # Runs as an init container so the "verify" container only starts
        # after every accession has been fetched and compressed.
        - name: download-sra
          image: ncbi/sra-tools:3.0.0
          command: ["/bin/sh", "-c"]
          args:
            - |
              set -e
              mkdir -p /data/raw

              # fetch ACCESSION LABEL
              # Downloads one accession and gzips it. Idempotent: a finished
              # sample (.fastq.gz present, no leftover .fastq) is skipped, and
              # anything half-done is redone from scratch. --force / -f are
              # needed because fasterq-dump and gzip both refuse to overwrite
              # existing output, which would make every retry fail.
              fetch() {
                acc="$1"
                label="$2"
                fq="/data/raw/${acc}.fastq"
                if [ -s "${fq}.gz" ] && [ ! -e "${fq}" ]; then
                  echo "=== ${acc} already downloaded, skipping ==="
                  return 0
                fi
                echo "=== Downloading ${acc} (${label}) ==="
                fasterq-dump "${acc}" --outdir /data/raw --temp /tmp --threads 4 --force
                echo "=== Compressing ${acc} ==="
                gzip -f "${fq}"
              }

              fetch ERR458493 "WT, single-end"
              fetch ERR458500 "snf2 mutant, single-end"
              echo "=== Done ==="
              ls -lh /data/raw/
          resources:
            requests:
              memory: "1Gi"
              cpu: "2"
            limits:
              memory: "2Gi"
              cpu: "4"
          volumeMounts:
            - name: workspace
              mountPath: /data
      containers:
        # Lists the downloaded files so they appear in the Job's main log.
        - name: verify
          image: busybox:1.36
          command: ["/bin/sh", "-c"]
          args:
            - |
              echo "=== Raw files ==="
              ls -lh /data/raw/
          volumeMounts:
            - name: workspace
              mountPath: /data
      volumes:
        - name: workspace
          persistentVolumeClaim:
            claimName: rnaseq-workspace
+42
View File
@@ -0,0 +1,42 @@
# 02b-job-sra-download-extra.yaml
# Downloads the four remaining replicate accessions into /data/raw on the
# shared PVC as gzip-compressed FASTQ.
---
apiVersion: batch/v1
kind: Job
metadata:
  name: sra-download-extra
  namespace: rnaseq
spec:
  # Failed pods are retried, so the download loop must be safe to re-run
  # against a partially populated /data/raw.
  backoffLimit: 2
  template:
    spec:
      restartPolicy: Never
      initContainers:
        # Runs as an init container so the "verify" container only starts
        # after every accession has been fetched and compressed.
        - name: download-sra
          image: ncbi/sra-tools:3.0.0
          command: ["/bin/sh", "-c"]
          args:
            - |
              set -e
              # Do not rely on an earlier job having created the directory.
              mkdir -p /data/raw
              for acc in ERR458494 ERR458495 ERR458501 ERR458502; do
                fq="/data/raw/${acc}.fastq"
                # Skip samples that finished in a previous attempt; redo
                # anything half-done. --force / -f are needed because
                # fasterq-dump and gzip both refuse to overwrite existing
                # output, which would make every retry fail.
                if [ -s "${fq}.gz" ] && [ ! -e "${fq}" ]; then
                  echo "=== ${acc} already downloaded, skipping ==="
                  continue
                fi
                echo "=== Downloading $acc ==="
                fasterq-dump "${acc}" --outdir /data/raw --temp /tmp --threads 4 --force
                # Compress per sample rather than via a *.fastq glob, which
                # fails when nothing is left to compress.
                gzip -f "${fq}"
              done
              ls -lh /data/raw/
          resources:
            requests:
              memory: "1Gi"
              cpu: "2"
            limits:
              memory: "2Gi"
              cpu: "4"
          volumeMounts:
            - name: workspace
              mountPath: /data
      containers:
        # Lists the downloaded files so they appear in the Job's main log.
        - name: verify
          image: busybox:1.36
          command: ["/bin/sh", "-c"]
          args: ["ls -lh /data/raw/"]
          volumeMounts:
            - name: workspace
              mountPath: /data
      volumes:
        - name: workspace
          persistentVolumeClaim:
            claimName: rnaseq-workspace
+43
View File
@@ -0,0 +1,43 @@
# 03-job-fastqc.yaml
# Runs FastQC over every gzip-compressed FASTQ in /data/raw and writes the
# reports to /data/fastqc on the shared PVC.
---
apiVersion: batch/v1
kind: Job
metadata:
  name: fastqc
  namespace: rnaseq
spec:
  backoffLimit: 2
  template:
    spec:
      restartPolicy: Never
      containers:
        - name: fastqc
          image: biocontainers/fastqc:v0.11.9_cv8
          command: ["/bin/sh", "-c"]
          args:
            - |
              set -e
              # FastQC requires --outdir to exist already; it will not
              # create it.
              mkdir -p /data/fastqc
              echo "=== Running FastQC on all samples ==="
              # Glob instead of a hard-coded pair so every downloaded sample
              # is checked, matching the message above.
              fastqc \
                /data/raw/*.fastq.gz \
                --outdir /data/fastqc \
                --threads 4
              echo "=== Done ==="
              ls -lh /data/fastqc/
          resources:
            requests:
              memory: "1Gi"
              cpu: "2"
            limits:
              memory: "2Gi"
              cpu: "4"
          volumeMounts:
            - name: workspace
              mountPath: /data
      volumes:
        - name: workspace
          persistentVolumeClaim:
            claimName: rnaseq-workspace
+62
View File
@@ -0,0 +1,62 @@
# 04-job-star.yaml
# Aligns the first WT / snf2 sample pair with STAR. Reads the index from
# /data/reference/star_index and writes one coordinate-sorted BAM per sample
# under /data/aligned/<accession>/.
---
apiVersion: batch/v1
kind: Job
metadata:
  namespace: rnaseq
  name: star-align
spec:
  backoffLimit: 1
  template:
    spec:
      restartPolicy: Never
      volumes:
        - name: workspace
          persistentVolumeClaim:
            claimName: rnaseq-workspace
      containers:
        - name: star
          image: quay.io/biocontainers/star:2.7.10a--h9ee0642_0
          command: ["/bin/sh", "-c"]
          args:
            - |
              set -e

              # align ACCESSION LABEL
              # Aligns /data/raw/ACCESSION.fastq.gz into
              # /data/aligned/ACCESSION/.
              align() {
                echo "=== Aligning $1 ($2) ==="
                mkdir -p "/data/aligned/$1"
                STAR \
                  --runMode alignReads \
                  --genomeDir /data/reference/star_index \
                  --readFilesIn "/data/raw/$1.fastq.gz" \
                  --readFilesCommand zcat \
                  --outSAMtype BAM SortedByCoordinate \
                  --outSAMattributes NH HI AS NM \
                  --outFileNamePrefix "/data/aligned/$1/" \
                  --runThreadN 4
              }

              align ERR458493 "WT"
              align ERR458500 "snf2 mutant"
              echo "=== Done ==="
              ls -lh /data/aligned/ERR458493/
              ls -lh /data/aligned/ERR458500/
          volumeMounts:
            - name: workspace
              mountPath: /data
          resources:
            requests:
              cpu: "4"
              memory: "6Gi"
            limits:
              cpu: "4"
              memory: "10Gi"
+52
View File
@@ -0,0 +1,52 @@
# 04b-job-star-extra.yaml
# Aligns the four remaining replicate samples with STAR, one after another,
# writing a coordinate-sorted BAM per sample under /data/aligned/<accession>/.
---
apiVersion: batch/v1
kind: Job
metadata:
  namespace: rnaseq
  name: star-align-extra
spec:
  backoffLimit: 1
  template:
    spec:
      restartPolicy: Never
      volumes:
        - name: workspace
          persistentVolumeClaim:
            claimName: rnaseq-workspace
      containers:
        - name: star
          image: quay.io/biocontainers/star:2.7.10a--h9ee0642_0
          command: ["/bin/sh", "-c"]
          args:
            - |
              set -e
              samples="ERR458494 ERR458495 ERR458501 ERR458502"

              # Align each sample into its own output directory.
              for acc in $samples; do
                out_dir="/data/aligned/$acc"
                echo "=== Aligning $acc ==="
                mkdir -p "$out_dir"
                STAR \
                  --runMode alignReads \
                  --genomeDir /data/reference/star_index \
                  --readFilesIn "/data/raw/$acc.fastq.gz" \
                  --readFilesCommand zcat \
                  --outSAMtype BAM SortedByCoordinate \
                  --outSAMattributes NH HI AS NM \
                  --outFileNamePrefix "$out_dir/" \
                  --runThreadN 4
              done
              echo "=== Done ==="

              # List what each alignment produced.
              for acc in $samples; do
                ls -lh "/data/aligned/$acc/"
              done
          volumeMounts:
            - name: workspace
              mountPath: /data
          resources:
            requests:
              cpu: "4"
              memory: "6Gi"
            limits:
              cpu: "4"
              memory: "10Gi"
+48
View File
@@ -0,0 +1,48 @@
# 05-job-featurecounts.yaml
# Builds a gene-level count matrix from the six STAR BAMs using featureCounts
# and writes it to /data/counts/gene_counts.txt on the shared PVC.
---
apiVersion: batch/v1
kind: Job
metadata:
  name: featurecounts-v2
  namespace: rnaseq
spec:
  backoffLimit: 2
  template:
    spec:
      restartPolicy: Never
      containers:
        - name: featurecounts
          image: quay.io/biocontainers/subread:2.0.6--he4a0461_0
          command: ["/bin/sh", "-c"]
          args:
            - |
              set -e
              # featureCounts does not create the parent directory of -o and
              # no earlier stage does either.
              mkdir -p /data/counts
              echo "=== Running featureCounts on 6 samples ==="
              # BAM order fixes the column order of the count matrix: three
              # WT samples first, then three snf2 samples. The DESeq2 stage
              # relabels columns by position, so keep this order in sync.
              featureCounts \
                -a /data/reference/annotation.gtf \
                -o /data/counts/gene_counts.txt \
                -T 4 \
                /data/aligned/ERR458493/Aligned.sortedByCoord.out.bam \
                /data/aligned/ERR458494/Aligned.sortedByCoord.out.bam \
                /data/aligned/ERR458495/Aligned.sortedByCoord.out.bam \
                /data/aligned/ERR458500/Aligned.sortedByCoord.out.bam \
                /data/aligned/ERR458501/Aligned.sortedByCoord.out.bam \
                /data/aligned/ERR458502/Aligned.sortedByCoord.out.bam
              echo "=== Done ==="
              cat /data/counts/gene_counts.txt.summary
          resources:
            requests:
              memory: "1Gi"
              cpu: "2"
            limits:
              memory: "2Gi"
              cpu: "4"
          volumeMounts:
            - name: workspace
              mountPath: /data
      volumes:
        - name: workspace
          persistentVolumeClaim:
            claimName: rnaseq-workspace
+69
View File
@@ -0,0 +1,69 @@
# 06-job-deseq2.yaml
# Runs DESeq2 (snf2 vs. WT) on the featureCounts matrix and writes the result
# table to /data/results/deseq2_results.csv on the shared PVC.
---
apiVersion: batch/v1
kind: Job
metadata:
  name: deseq2-v2
  namespace: rnaseq
spec:
  backoffLimit: 1
  template:
    spec:
      restartPolicy: Never
      containers:
        - name: deseq2
          image: quay.io/biocontainers/bioconductor-deseq2:1.46.0--r44he5774e6_1
          command: ["/bin/sh", "-c"]
          args:
            - |
              set -e
              # write.csv does not create missing directories and no earlier
              # stage creates /data/results; make it before the analysis so
              # the run cannot fail at the very last step.
              mkdir -p /data/results
              # The quoted heredoc keeps $ in the R code from being expanded
              # by the shell (e.g. res$padj).
              cat <<'EOF' > /tmp/deseq2_analysis.R
              library(DESeq2)

              # skip = 1 drops the leading featureCounts comment line; Geneid
              # becomes the row names.
              counts_raw <- read.table("/data/counts/gene_counts.txt",
                                       header = TRUE, skip = 1, row.names = 1)

              # After row.names = 1 the first five columns are Chr, Start,
              # End, Strand and Length; columns 6-11 are the six samples.
              counts <- counts_raw[, 6:11]

              # Labels are assigned by position and therefore rely on the BAM
              # order given to featureCounts (3 x WT, then 3 x snf2).
              colnames(counts) <- c("WT_1", "WT_2", "WT_3",
                                    "SNF2_1", "SNF2_2", "SNF2_3")

              # WT is the first factor level, i.e. the reference condition.
              coldata <- data.frame(
                condition = factor(c("WT", "WT", "WT",
                                     "SNF2", "SNF2", "SNF2"),
                                   levels = c("WT", "SNF2"))
              )
              rownames(coldata) <- colnames(counts)

              dds <- DESeqDataSetFromMatrix(countData = counts,
                                            colData = coldata,
                                            design = ~ condition)
              dds <- DESeq(dds)
              res <- results(dds)
              res <- res[order(res$padj), ]
              write.csv(as.data.frame(res), "/data/results/deseq2_results.csv")

              cat("=== Summary ===\n")
              summary(res)
              cat("\n=== Top 10 genes by adjusted p-value ===\n")
              print(head(as.data.frame(res), 10))
              EOF
              Rscript /tmp/deseq2_analysis.R
          resources:
            requests:
              memory: "2Gi"
              cpu: "2"
            limits:
              memory: "4Gi"
              cpu: "4"
          volumeMounts:
            - name: workspace
              mountPath: /data
      volumes:
        - name: workspace
          persistentVolumeClaim:
            claimName: rnaseq-workspace
+43
View File
@@ -0,0 +1,43 @@
# RNA-seq DESeq2 Pipeline
A sequential Kubernetes Job pipeline for differential expression analysis on yeast RNA-seq data. Each stage runs as a one-shot Job against a shared PVC, in order.
## Dataset
- Source: Gierliński et al., ENA accession PRJEB5348
- Reads: 50bp single-end
- Conditions: wild-type (WT: ERR458493–ERR458495) vs. snf2 deletion mutant (snf2: ERR458500–ERR458502)
## Pipeline stages
| Order | File | Stage |
|---|---|---|
| 1 | `01-pvc.yaml` | Shared PersistentVolumeClaim for pipeline data and intermediate files |
| 2 | `02-job-sra-download.yaml` | Downloads raw FASTQ reads from SRA/ENA |
| 2b | `02b-job-sra-download-extra.yaml` | Downloads the remaining replicate samples |
| 3 | `03-job-fastqc.yaml` | FastQC read quality control |
| 4 | `04-job-star.yaml` | STAR alignment to the reference genome |
| 4b | `04b-job-star-extra.yaml` | STAR alignment for the remaining replicate samples |
| 5 | `05-job-featurecounts.yaml` | Gene-level count matrix from aligned reads |
| 6 | `06-job-deseq2.yaml` | DESeq2 differential expression analysis (WT vs. snf2) |
## Results
- STAR alignment: ~85–90% mapping rate across samples
- DESeq2 output visualized (volcano plot, etc.) in a Jupyter R notebook
## Running
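Namespace: `rnaseq` (create it first with `kubectl create namespace rnaseq` if it does not already exist). Jobs are sequential — each depends on the previous stage's output landing on the shared PVC, so apply one and wait for it to complete before moving to the next. Note that the STAR and featureCounts stages read a prebuilt STAR index (`/data/reference/star_index`) and a GTF annotation (`/data/reference/annotation.gtf`) from the PVC; none of the manifests listed here creates them, so they must be staged on the volume before step 4: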
```bash
kubectl apply -f 01-pvc.yaml
kubectl apply -f 02-job-sra-download.yaml
kubectl get jobs -n rnaseq -w # wait for Completed before continuing
kubectl apply -f 02b-job-sra-download-extra.yaml
kubectl apply -f 03-job-fastqc.yaml
kubectl apply -f 04-job-star.yaml
kubectl apply -f 04b-job-star-extra.yaml
kubectl apply -f 05-job-featurecounts.yaml
kubectl apply -f 06-job-deseq2.yaml
```