Initial RNA-seq DESeq2 pipeline manifests

2026-06-21 06:39:53 -04:00
commit 395c18a9e6
9 changed files with 431 additions and 0 deletions
@@ -0,0 +1,13 @@
 # 01-pvc.yaml
 # Claim for the pipeline's shared working volume. The Jobs in this pipeline
 # reference it by claimName and mount it at /data.
 kind: PersistentVolumeClaim
 apiVersion: v1
 metadata:
  namespace: rnaseq
  name: rnaseq-workspace
 spec:
  storageClassName: longhorn
  # ReadWriteOnce: read-write from a single node at a time.
  accessModes: [ReadWriteOnce]
  resources:
    requests:
      storage: 30Gi
@@ -0,0 +1,59 @@
 # 02-job-sra-download.yaml
 # Downloads the first WT / snf2 sample pair (ERR458493, ERR458500) into
 # /data/raw on the shared PVC and gzips the FASTQ files.
 # The download runs as an init container; the main container only lists
 # /data/raw afterwards.
 apiVersion: batch/v1
 kind: Job
 metadata:
  name: sra-download
  namespace: rnaseq
 spec:
  # A retried pod sees whatever an earlier attempt left on the PVC, so the
  # script below must be safe to re-run (hence --force / gzip -f).
  backoffLimit: 2
  template:
    spec:
      restartPolicy: Never
      initContainers:
        - name: download-sra
          image: ncbi/sra-tools:3.0.0
          command: ["/bin/sh", "-c"]
          args:
            - |
              set -e
              mkdir -p /data/raw
              echo "=== Downloading ERR458493 (WT, single-end) ==="
              # --force: overwrite a .fastq left behind by a failed attempt
              fasterq-dump ERR458493 --force --outdir /data/raw --temp /tmp --threads 4
              echo "=== Downloading ERR458500 (snf2 mutant, single-end) ==="
              fasterq-dump ERR458500 --force --outdir /data/raw --temp /tmp --threads 4
              echo "=== Compressing ==="
              # -f: replace a .fastq.gz written by an earlier attempt instead
              # of aborting with "already exists; not overwritten"
              gzip -f /data/raw/*.fastq
              echo "=== Done ==="
              ls -lh /data/raw/
          resources:
            requests:
              memory: "1Gi"
              cpu: "2"
            limits:
              memory: "2Gi"
              cpu: "4"
          volumeMounts:
            - name: workspace
              mountPath: /data
      containers:
        # Runs only after the init container has succeeded.
        - name: verify
          image: busybox:1.36
          command: ["/bin/sh", "-c"]
          args:
            - |
              echo "=== Raw files ==="
              ls -lh /data/raw/
          volumeMounts:
            - name: workspace
              mountPath: /data
      volumes:
        - name: workspace
          persistentVolumeClaim:
            claimName: rnaseq-workspace
@@ -0,0 +1,42 @@
 # 02b-job-sra-download-extra.yaml
 # Downloads four further accessions (ERR458494, ERR458495, ERR458501,
 # ERR458502) into /data/raw on the shared PVC and gzips them.
 # The download runs as an init container; the main container only lists
 # /data/raw afterwards.
 apiVersion: batch/v1
 kind: Job
 metadata:
  name: sra-download-extra
  namespace: rnaseq
 spec:
  # A retried pod sees whatever an earlier attempt left on the PVC, so the
  # script below must be safe to re-run (hence --force / gzip -f).
  backoffLimit: 2
  template:
    spec:
      restartPolicy: Never
      initContainers:
        - name: download-sra
          image: ncbi/sra-tools:3.0.0
          command: ["/bin/sh", "-c"]
          args:
            - |
              set -e
              # Do not rely on another Job having created the directory.
              mkdir -p /data/raw
              for acc in ERR458494 ERR458495 ERR458501 ERR458502; do
                echo "=== Downloading $acc ==="
                # --force: overwrite a .fastq left behind by a failed attempt
                fasterq-dump "$acc" --force --outdir /data/raw --temp /tmp --threads 4
              done
              # -f: replace a .fastq.gz written by an earlier attempt instead
              # of aborting with "already exists; not overwritten"
              gzip -f /data/raw/*.fastq
              ls -lh /data/raw/
          resources:
            requests:
              memory: "1Gi"
              cpu: "2"
            limits:
              memory: "2Gi"
              cpu: "4"
          volumeMounts:
            - name: workspace
              mountPath: /data
      containers:
        # Runs only after the init container has succeeded.
        - name: verify
          image: busybox:1.36
          command: ["/bin/sh", "-c"]
          args: ["ls -lh /data/raw/"]
          volumeMounts:
            - name: workspace
              mountPath: /data
      volumes:
        - name: workspace
          persistentVolumeClaim:
            claimName: rnaseq-workspace
@@ -0,0 +1,43 @@
 # 03-job-fastqc.yaml
 # Runs FastQC on ERR458493 and ERR458500 from /data/raw and writes the
 # reports to /data/fastqc on the shared PVC.
 apiVersion: batch/v1
 kind: Job
 metadata:
  name: fastqc
  namespace: rnaseq
 spec:
  backoffLimit: 2
  template:
    spec:
      restartPolicy: Never
      containers:
        - name: fastqc
          image: biocontainers/fastqc:v0.11.9_cv8
          command: ["/bin/sh", "-c"]
          args:
            - |
              set -e
              # FastQC does not create --outdir itself; it exits with an
              # error if the directory is missing.
              mkdir -p /data/fastqc
              echo "=== Running FastQC on all samples ==="
              fastqc \
                /data/raw/ERR458493.fastq.gz \
                /data/raw/ERR458500.fastq.gz \
                --outdir /data/fastqc \
                --threads 4
              echo "=== Done ==="
              ls -lh /data/fastqc/
          resources:
            requests:
              memory: "1Gi"
              cpu: "2"
            limits:
              memory: "2Gi"
              cpu: "4"
          volumeMounts:
            - name: workspace
              mountPath: /data
      volumes:
        - name: workspace
          persistentVolumeClaim:
            claimName: rnaseq-workspace
@@ -0,0 +1,62 @@
 # 04-job-star.yaml
 # STAR alignment of ERR458493 and ERR458500. Reads come from /data/raw and
 # the genome index from /data/reference/star_index; each sample's output is
 # written to its own directory under /data/aligned.
 kind: Job
 apiVersion: batch/v1
 metadata:
  namespace: rnaseq
  name: star-align
 spec:
  template:
    spec:
      volumes:
        - name: workspace
          persistentVolumeClaim:
            claimName: rnaseq-workspace
      restartPolicy: Never
      containers:
        - name: star
          image: quay.io/biocontainers/star:2.7.10a--h9ee0642_0
          resources:
            requests: { memory: "6Gi", cpu: "4" }
            limits: { memory: "10Gi", cpu: "4" }
          volumeMounts:
            - mountPath: /data
              name: workspace
          command:
            - "/bin/sh"
            - "-c"
          args:
            - |
              set -e
              echo "=== Aligning ERR458493 (WT) ==="
              mkdir -p /data/aligned/ERR458493
              STAR \
                --runMode alignReads \
                --genomeDir /data/reference/star_index \
                --readFilesIn /data/raw/ERR458493.fastq.gz \
                --readFilesCommand zcat \
                --outSAMtype BAM SortedByCoordinate \
                --outSAMattributes NH HI AS NM \
                --outFileNamePrefix /data/aligned/ERR458493/ \
                --runThreadN 4
              echo "=== Aligning ERR458500 (snf2 mutant) ==="
              mkdir -p /data/aligned/ERR458500
              STAR \
                --runMode alignReads \
                --genomeDir /data/reference/star_index \
                --readFilesIn /data/raw/ERR458500.fastq.gz \
                --readFilesCommand zcat \
                --outSAMtype BAM SortedByCoordinate \
                --outSAMattributes NH HI AS NM \
                --outFileNamePrefix /data/aligned/ERR458500/ \
                --runThreadN 4
              echo "=== Done ==="
              ls -lh /data/aligned/ERR458493/
              ls -lh /data/aligned/ERR458500/
  backoffLimit: 1
@@ -0,0 +1,52 @@
 # 04b-job-star-extra.yaml
 # STAR alignment of ERR458494, ERR458495, ERR458501 and ERR458502, one
 # after another. Reads come from /data/raw and the genome index from
 # /data/reference/star_index; each sample's output is written to its own
 # directory under /data/aligned.
 kind: Job
 apiVersion: batch/v1
 metadata:
  namespace: rnaseq
  name: star-align-extra
 spec:
  template:
    spec:
      volumes:
        - name: workspace
          persistentVolumeClaim:
            claimName: rnaseq-workspace
      restartPolicy: Never
      containers:
        - name: star
          image: quay.io/biocontainers/star:2.7.10a--h9ee0642_0
          resources:
            requests: { memory: "6Gi", cpu: "4" }
            limits: { memory: "10Gi", cpu: "4" }
          volumeMounts:
            - mountPath: /data
              name: workspace
          command:
            - "/bin/sh"
            - "-c"
          args:
            - |
              set -e
              for acc in ERR458494 ERR458495 ERR458501 ERR458502; do
                echo "=== Aligning $acc ==="
                mkdir -p /data/aligned/$acc
                STAR \
                  --runMode alignReads \
                  --genomeDir /data/reference/star_index \
                  --readFilesIn /data/raw/$acc.fastq.gz \
                  --readFilesCommand zcat \
                  --outSAMtype BAM SortedByCoordinate \
                  --outSAMattributes NH HI AS NM \
                  --outFileNamePrefix /data/aligned/$acc/ \
                  --runThreadN 4
              done
              echo "=== Done ==="
              for acc in ERR458494 ERR458495 ERR458501 ERR458502; do
                ls -lh /data/aligned/$acc/
              done
  backoffLimit: 1
@@ -0,0 +1,48 @@
 # 05-job-featurecounts.yaml
 # Counts reads per gene with featureCounts for the six coordinate-sorted
 # BAMs under /data/aligned, using /data/reference/annotation.gtf, and
 # writes the table to /data/counts/gene_counts.txt on the shared PVC.
 apiVersion: batch/v1
 kind: Job
 metadata:
  name: featurecounts-v2
  namespace: rnaseq
 spec:
  backoffLimit: 2
  template:
    spec:
      restartPolicy: Never
      containers:
        - name: featurecounts
          image: quay.io/biocontainers/subread:2.0.6--he4a0461_0
          command: ["/bin/sh", "-c"]
          args:
            - |
              set -e
              # featureCounts does not create the output directory, and no
              # earlier stage writes to /data/counts.
              mkdir -p /data/counts
              echo "=== Running featureCounts on 6 samples ==="
              # BAM order here fixes the sample column order of the table.
              featureCounts \
                -a /data/reference/annotation.gtf \
                -o /data/counts/gene_counts.txt \
                -T 4 \
                /data/aligned/ERR458493/Aligned.sortedByCoord.out.bam \
                /data/aligned/ERR458494/Aligned.sortedByCoord.out.bam \
                /data/aligned/ERR458495/Aligned.sortedByCoord.out.bam \
                /data/aligned/ERR458500/Aligned.sortedByCoord.out.bam \
                /data/aligned/ERR458501/Aligned.sortedByCoord.out.bam \
                /data/aligned/ERR458502/Aligned.sortedByCoord.out.bam
              echo "=== Done ==="
              cat /data/counts/gene_counts.txt.summary
          resources:
            requests:
              memory: "1Gi"
              cpu: "2"
            limits:
              memory: "2Gi"
              cpu: "4"
          volumeMounts:
            - name: workspace
              mountPath: /data
      volumes:
        - name: workspace
          persistentVolumeClaim:
            claimName: rnaseq-workspace
@@ -0,0 +1,69 @@
 # 06-job-deseq2.yaml
 # Runs a DESeq2 WT-vs-SNF2 comparison on /data/counts/gene_counts.txt and
 # writes the results, ordered by adjusted p-value, to
 # /data/results/deseq2_results.csv on the shared PVC.
 # The R script is generated inside the container from the heredoc below.
 apiVersion: batch/v1
 kind: Job
 metadata:
  name: deseq2-v2
  namespace: rnaseq
 spec:
  backoffLimit: 1
  template:
    spec:
      restartPolicy: Never
      containers:
        - name: deseq2
          image: quay.io/biocontainers/bioconductor-deseq2:1.46.0--r44he5774e6_1
          command: ["/bin/sh", "-c"]
          args:
            - |
              set -e
              # write.csv() does not create missing directories; create the
              # output directory up front so the job cannot fail on the
              # final write after the whole analysis has run.
              mkdir -p /data/results
              cat <<'EOF' > /tmp/deseq2_analysis.R
              library(DESeq2)
              # skip = 1 drops the first line of the file; column 1 becomes
              # the row names.
              counts_raw <- read.table("/data/counts/gene_counts.txt",
                                        header = TRUE, skip = 1, row.names = 1)
              # Columns 6-11 are taken as the six per-sample count columns.
              counts <- counts_raw[, 6:11]
              colnames(counts) <- c("WT_1", "WT_2", "WT_3",
                                     "SNF2_1", "SNF2_2", "SNF2_3")
              # WT is listed first in levels, making it the reference level.
              coldata <- data.frame(
                condition = factor(c("WT", "WT", "WT",
                                      "SNF2", "SNF2", "SNF2"),
                                    levels = c("WT", "SNF2"))
              )
              rownames(coldata) <- colnames(counts)
              dds <- DESeqDataSetFromMatrix(countData = counts,
                                             colData = coldata,
                                             design = ~ condition)
              dds <- DESeq(dds)
              res <- results(dds)
              res <- res[order(res$padj), ]
              write.csv(as.data.frame(res), "/data/results/deseq2_results.csv")
              cat("=== Summary ===\n")
              summary(res)
              cat("\n=== Top 10 genes by adjusted p-value ===\n")
              print(head(as.data.frame(res), 10))
              EOF
              Rscript /tmp/deseq2_analysis.R
          resources:
            requests:
              memory: "2Gi"
              cpu: "2"
            limits:
              memory: "4Gi"
              cpu: "4"
          volumeMounts:
            - name: workspace
              mountPath: /data
      volumes:
        - name: workspace
          persistentVolumeClaim:
            claimName: rnaseq-workspace
@@ -0,0 +1,43 @@
 # RNA-seq DESeq2 Pipeline
 A sequential Kubernetes Job pipeline for differential expression analysis on yeast RNA-seq data. Each stage runs as a one-shot Job against a shared PVC, in order.
 ## Dataset
 - Source: Gierliński et al., ENA accession PRJEB5348
 - Reads: 50bp single-end
 - Conditions: wild-type (WT: ERR458493–495) vs. snf2 deletion mutant (snf2: ERR458500–502)
 ## Pipeline stages
 | Order | File | Stage |
 |---|---|---|
 | 1 | `01-pvc.yaml` | Shared PersistentVolumeClaim for pipeline data and intermediate files |
 | 2 | `02-job-sra-download.yaml` | Downloads raw FASTQ reads from SRA/ENA |
 | 2b | `02b-job-sra-download-extra.yaml` | Downloads the remaining replicate samples |
 | 3 | `03-job-fastqc.yaml` | FastQC read quality control |
 | 4 | `04-job-star.yaml` | STAR alignment to the reference genome |
 | 4b | `04b-job-star-extra.yaml` | STAR alignment for the remaining replicate samples |
 | 5 | `05-job-featurecounts.yaml` | Gene-level count matrix from aligned reads |
 | 6 | `06-job-deseq2.yaml` | DESeq2 differential expression analysis (WT vs. snf2) |
 ## Results
 - STAR alignment: ~85–90% mapping rate across samples
 - DESeq2 output visualized (volcano plot, etc.) in a Jupyter R notebook
 ## Running
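 All resources live in the `rnaseq` namespace, which must already exist (`kubectl create namespace rnaseq`). The STAR and featureCounts stages also expect a STAR index at `/data/reference/star_index` and an annotation file at `/data/reference/annotation.gtf` on the shared PVC; none of the manifests listed above creates them, so stage them separately before step 4. Jobs are sequential — each depends on the previous stage's output landing on the shared PVC — so apply one manifest at a time and wait for its Job to complete before applying the next: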
 ```bash
 kubectl apply -f 01-pvc.yaml
 kubectl apply -f 02-job-sra-download.yaml
 kubectl get jobs -n rnaseq -w          # wait for COMPLETIONS 1/1, then Ctrl-C; repeat after every apply below
 kubectl apply -f 02b-job-sra-download-extra.yaml
 kubectl apply -f 03-job-fastqc.yaml
 kubectl apply -f 04-job-star.yaml
 kubectl apply -f 04b-job-star-extra.yaml
 kubectl apply -f 05-job-featurecounts.yaml
 kubectl apply -f 06-job-deseq2.yaml
 ```