Purpose
Reprocess Apul WGBS data to see if it matches the output Sam and Steven got.
Results
-
multiqc: multiqc_report.html
-
.cov.gz files: https://gannet.fish.washington.edu/metacarcinus/E5/20250714_methylseq_Apul/bismark/methylation_calls/methylation_coverage/
-
methylseq output directory: https://gannet.fish.washington.edu/metacarcinus/E5/20250714_methylseq_Apul/
- 10x CpG matrix: https://gannet.fish.washington.edu/metacarcinus/E5/20250728_meth_Apul/merged-WGBS-CpG-counts.csv
- 10x CpG matrix filtered for coverage in all samples: https://gannet.fish.washington.edu/metacarcinus/E5/20250728_meth_Apul/merged-WGBS-CpG-counts_filtered.csv
Methods
Copy Apul genome to Klone
cd /gsscratch/srlab/strigg/GENOMES
wget https://gannet.fish.washington.edu/seashell/bu-github/deep-dive-expression/D-Apul/data/Apulcra-genome.fa
Copy Apul WGBS fastqs to Klone
mkdir /gscratch/scrubbed/strigg/analyses/20250714_methylseq/20250714_methylseq_Apul
cd /gscratch/scrubbed/strigg/analyses/20250714_methylseq/20250714_methylseq_Apul
rsync --archive --verbose --progress --include='*/' --include='*.fq.gz' --exclude='*' shellytrigg@gannet.fish.washington.edu:/volume2/web/gitrepos/urol-e5/timeseries_molecular/D-Apul/output/01.20-D-Apul-WGBS-trimming-fastp-FastQC-MultiQC/ .
Reprocess data with methylseq pipeline
nextflow run nf-core/methylseq \
-c /gscratch/srlab/strigg/bin/uw_hyak_srlab.config \
--input /gscratch/scrubbed/strigg/analyses/20250714_methylseq/20250714_methylseq_Apul/samplesheet.csv \
--outdir /gscratch/scrubbed/strigg/analyses/20250714_methylseq/20250714_methylseq_Apul \
--fasta /gscratch/srlab/strigg/GENOMES/Apulcra-genome.fa \
--em_seq \
-resume \
-with-report nf_report.html \
-with-trace \
-with-timeline nf_timeline.html \
--skip_trimming \
--nomeseq
copy output to gannet
rsync --archive --verbose --progress --include='*/' --exclude='*.gz' --exclude='*.bam' --exclude='BismarkIndex' --exclude=data/ 20250714_methylseq_Apul shellytrigg@gannet.fish.washington.edu:/volume2/web/metacarcinus/E5/
rsync --archive --verbose --progress bismark shellytrigg@gannet.fish.washington.edu:/volume2/web/metacarcinus/E5/20250714_methylseq_Apul
create 10x CpG matrix
- SLURM: 20250728_mergeCov_Apul.sh
#!/bin/bash
## Job Name
#SBATCH --job-name=20250728_mergeCov
## Allocation Definition
#SBATCH --account=coenv
#SBATCH --partition=cpu-g2
## Resources
## Nodes
#SBATCH --nodes=1
## Walltime (days-hours:minutes:seconds format)
#SBATCH --time=1-00:00:00
## Memory per node
#SBATCH --mem=300G
##turn on e-mail notification
#SBATCH --mail-type=ALL
#SBATCH --mail-user=strigg@uw.edu
## Specify the working directory for this job
#SBATCH --chdir=/gscratch/scrubbed/strigg/analyses/20250728_meth_Apul
%%bash
set -ex
# make bed file from cov file keeping only CpGs w. 10x cov
for f in /gscratch/scrubbed/strigg/analyses/20250714_methylseq/20250714_methylseq_Apul/bismark/methylation_calls/methylation_coverage/*.fastp-trim_bismark_bt2_pe.deduplicated.bismark.cov.gz
do
STEM=$(basename "${f}") # Get the entire filename including the long suffix
STEM="${STEM%.POR-*.fastp-trim_bismark_bt2_pe.deduplicated.bismark.cov.gz}" # Remove the suffix using parameter expansion
zcat "${f}" | awk -F $'\t' 'BEGIN {OFS = FS} {if ($5+$6 >= 10) {print $1, $2, $3, $4}}' \
> "${STEM}"_10x.bedgraph
done
# create modified tables with two columns; one is the CpG ID which is merged chrom and start site; one is the %meth
for file in *10x.bedgraph; do
awk -F"\t" -v fname="${file%_10x*}" 'BEGIN {print "CpG\t" fname}{print "CpG_"_$1"_"$2"\t"$4}' "$file" > "${file%.bedgraph}_processed.txt"
done
python /gscratch/srlab/strigg/scripts/merge_processed_txt.py
python script merge_processed_txt.py
copy output to gannet
rsync --progress --verbose --archive 20250728_meth_Apul shellytrigg@gannet.fish.washington.edu:/volume2/web/metacarcinus/E5