Alignment and mapping workflow

1. Prepare your data

1.1 Download the files from your sequencing service provider

# The reject pattern is quoted so the local shell cannot expand it, and --ask-password prompts for the
# password instead of storing it in these notes, the shell history and the process list.
(base) qianjianghu@QJ-Ubuntu:/media/qianjianghu/Udata/DataAnaDriver/RNA_seq/Mareike$ wget -r -np -nH -R '*.html' -c --user webreader --ask-password http://xena.lechnerlab.de:3080/bertrams/NexusDNA_GSK/Bulk-RNA%20sequencing_03032023/

1.2 upload data to Pitt CRC server

# create folders in Pitt CRC server
[huqj@login0b ~]$ cd /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS
[huqj@login0b ML_mouse_PCLS]$
[huqj@login0b ML_mouse_PCLS]$ mkdir Counts  Data  Jobs  Mapping  QC
[huqj@login0b ML_mouse_PCLS]$ ls
Counts  Data  Jobs  Mapping  QC
# upload the data to Pitt CRC server
# from local Ubuntu terminal
(base) qianjianghu@QJ-Ubuntu:/media/qianjianghu/Udata/DataAnaDriver/RNA_seq/Mareike/Mareike_mouse_PCLS$ scp -r ./ huqj@htc.crc.pitt.edu:/bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/Data/raw/

1.3 check md5sum

[huqj@login0b raw]$ grep -f  Mouse_PCLS.txt md5sums.txt | md5sum -c
./Sample_23L000907/23L000907_S5_L002_R2_001.fastq.gz: OK
./Sample_23L000907/23L000907_S5_L002_R1_001.fastq.gz: OK
./Sample_23L000907/23L000907_S5_L001_R1_001.fastq.gz: OK
./Sample_23L000907/23L000907_S5_L001_R2_001.fastq.gz: OK
./Sample_23L000914/23L000914_S12_L002_R1_001.fastq.gz: OK
./Sample_23L000914/23L000914_S12_L001_R2_001.fastq.gz: OK
./Sample_23L000905/23L000905_S3_L001_R1_001.fastq.gz: OK
./Sample_23L000914/23L000914_S12_L001_R1_001.fastq.gz: OK
./Sample_23L000913/23L000913_S11_L001_R2_001.fastq.gz: OK
./Sample_23L000905/23L000905_S3_L002_R1_001.fastq.gz: OK
./Sample_23L000905/23L000905_S3_L001_R2_001.fastq.gz: OK
./Sample_23L000905/23L000905_S3_L002_R2_001.fastq.gz: OK
./Sample_23L000914/23L000914_S12_L002_R2_001.fastq.gz: OK
./Sample_23L000913/23L000913_S11_L002_R2_001.fastq.gz: OK
./Sample_23L000909/23L000909_S7_L001_R1_001.fastq.gz: OK
./Sample_23L000913/23L000913_S11_L001_R1_001.fastq.gz: OK
./Sample_23L000909/23L000909_S7_L001_R2_001.fastq.gz: OK
./Sample_23L000913/23L000913_S11_L002_R1_001.fastq.gz: OK
./Sample_23L000909/23L000909_S7_L002_R2_001.fastq.gz: OK
./Sample_23L000909/23L000909_S7_L002_R1_001.fastq.gz: OK
./Sample_23L000920/23L000920_S18_L001_R2_001.fastq.gz: OK
./Sample_23L000920/23L000920_S18_L002_R1_001.fastq.gz: OK
./Sample_23L000920/23L000920_S18_L001_R1_001.fastq.gz: OK
./Sample_23L000920/23L000920_S18_L002_R2_001.fastq.gz: OK
./Sample_23L000903/23L000903_S1_L001_R1_001.fastq.gz: OK
./Sample_23L000903/23L000903_S1_L002_R1_001.fastq.gz: OK
./Sample_23L000903/23L000903_S1_L002_R2_001.fastq.gz: OK
./Sample_23L000904/23L000904_S2_L002_R2_001.fastq.gz: OK
./Sample_23L000911/23L000911_S9_L002_R1_001.fastq.gz: OK
./Sample_23L000910/23L000910_S8_L002_R1_001.fastq.gz: OK
./Sample_23L000903/23L000903_S1_L001_R2_001.fastq.gz: OK
./Sample_23L000904/23L000904_S2_L001_R2_001.fastq.gz: OK
./Sample_23L000910/23L000910_S8_L002_R2_001.fastq.gz: OK
./Sample_23L000911/23L000911_S9_L002_R2_001.fastq.gz: OK
./Sample_23L000911/23L000911_S9_L001_R1_001.fastq.gz: OK
./Sample_23L000904/23L000904_S2_L002_R1_001.fastq.gz: OK
./Sample_23L000904/23L000904_S2_L001_R1_001.fastq.gz: OK
./Sample_23L000910/23L000910_S8_L001_R2_001.fastq.gz: OK
./Sample_23L000910/23L000910_S8_L001_R1_001.fastq.gz: OK
./Sample_23L000911/23L000911_S9_L001_R2_001.fastq.gz: OK
./Sample_23L000908/23L000908_S6_L002_R1_001.fastq.gz: OK
./Sample_23L000919/23L000919_S17_L002_R2_001.fastq.gz: OK
./Sample_23L000919/23L000919_S17_L001_R1_001.fastq.gz: OK
./Sample_23L000908/23L000908_S6_L002_R2_001.fastq.gz: OK
./Sample_23L000908/23L000908_S6_L001_R1_001.fastq.gz: OK
./Sample_23L000908/23L000908_S6_L001_R2_001.fastq.gz: OK
./Sample_23L000912/23L000912_S10_L002_R1_001.fastq.gz: OK
./Sample_23L000919/23L000919_S17_L002_R1_001.fastq.gz: OK
./Sample_23L000912/23L000912_S10_L001_R2_001.fastq.gz: OK
./Sample_23L000906/23L000906_S4_L001_R1_001.fastq.gz: OK
./Sample_23L000906/23L000906_S4_L002_R2_001.fastq.gz: OK
./Sample_23L000906/23L000906_S4_L001_R2_001.fastq.gz: OK
./Sample_23L000906/23L000906_S4_L002_R1_001.fastq.gz: OK
./Sample_23L000919/23L000919_S17_L001_R2_001.fastq.gz: OK
./Sample_23L000912/23L000912_S10_L001_R1_001.fastq.gz: OK
./Sample_23L000912/23L000912_S10_L002_R2_001.fastq.gz: OK

2. Know more about your samples

Find out how the library preparation was done (library preparation kit, adapters, purification)
and how the samples were sequenced: sequencing platform, read length, depth, targeted number of reads, paired-end or single-end reads, and strand-specific or non-strand-specific protocol.

3. QC

3.1 Run FastQC

a. Move to FastQC folder under /Jobs and open fastqc.job using vim editor.

[huqj@login0b]$ cd /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/Jobs/FastQC
[huqj@login0b FastQC]$ vim fastqc.job
#!/bin/bash
# fastqc.job - run FastQC on the raw paired-end reads of one sample.
#
# Submitted as a SLURM array job: every array task processes the sample whose
# zero-based position in Data/raw/Mouse_PCLS.txt equals SLURM_ARRAY_TASK_ID.
# Reads  : $project/Data/merged_fastq/<sample>_1.fastq.gz and <sample>_2.fastq.gz
# Writes : FastQC reports into $project/QC/FastQC/Raw
# The OUT/ directory used for the job log must exist before `sbatch` is called.
#
# Two CPUs are requested because FastQC parallelises per input file and each
# task has exactly two files (R1 and R2).
#SBATCH -J fastqc
#SBATCH -c 2
#SBATCH -t 2:00:00
#SBATCH -o OUT/fastqc-%A_%a.out
#SBATCH --array=0-13 # job array index
#SBATCH --mail-type=END,FAIL
#SBATCH --mail-user=huqj@pitt.edu
###########

####### set-up fastqc
module load fastqc/0.11.7

# Strict mode is switched on after `module load` so it applies to our own commands only.
set -euo pipefail
set -x

################

project=/bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS
sample_list=$project/Data/raw/Mouse_PCLS.txt

if [[ ! -r "$sample_list" ]]; then
  echo "cannot read sample list: $sample_list" >&2
  exit 1
fi

# Split the list on any whitespace (as the former $(cat ...) did) but without glob expansion.
# `read -d ''` returns non-zero at end of file even on success, hence `|| true`.
read -r -d '' -a names < "$sample_list" || true

task_id=${SLURM_ARRAY_TASK_ID:?this script must be submitted as a SLURM array job}
if (( task_id >= ${#names[@]} )); then
  echo "array index $task_id is outside the ${#names[@]} samples listed in $sample_list" >&2
  exit 1
fi

sample=${names[task_id]}
echo "$sample"

fastq=$project/Data/merged_fastq
out=$project/QC/FastQC/Raw

#################

mkdir -p "$out"

########

# One invocation for both mates; --threads lets FastQC process R1 and R2 at the same time.
fastqc --threads "${SLURM_CPUS_PER_TASK:-2}" -o "$out" \
  "$fastq/${sample}_1.fastq.gz" \
  "$fastq/${sample}_2.fastq.gz"

check report

[huqj@login0b OUT]$ cd /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/Jobs/FastQC/OUT
[huqj@login0b OUT]$ grep "Analysis complete for" *.out
fastqc-1938004_0.out:Analysis complete for 23L000903_1.fastq.gz
fastqc-1938004_0.out:Analysis complete for 23L000903_2.fastq.gz
fastqc-1938004_10.out:Analysis complete for 23L000913_1.fastq.gz
fastqc-1938004_10.out:Analysis complete for 23L000913_2.fastq.gz
fastqc-1938004_11.out:Analysis complete for 23L000914_1.fastq.gz
fastqc-1938004_11.out:Analysis complete for 23L000914_2.fastq.gz
fastqc-1938004_12.out:Analysis complete for 23L000919_1.fastq.gz
fastqc-1938004_12.out:Analysis complete for 23L000919_2.fastq.gz
fastqc-1938004_13.out:Analysis complete for 23L000920_1.fastq.gz
fastqc-1938004_13.out:Analysis complete for 23L000920_2.fastq.gz
fastqc-1938004_1.out:Analysis complete for 23L000904_1.fastq.gz
fastqc-1938004_1.out:Analysis complete for 23L000904_2.fastq.gz
fastqc-1938004_2.out:Analysis complete for 23L000905_1.fastq.gz
fastqc-1938004_2.out:Analysis complete for 23L000905_2.fastq.gz
fastqc-1938004_3.out:Analysis complete for 23L000906_1.fastq.gz
fastqc-1938004_3.out:Analysis complete for 23L000906_2.fastq.gz
fastqc-1938004_4.out:Analysis complete for 23L000907_1.fastq.gz
fastqc-1938004_4.out:Analysis complete for 23L000907_2.fastq.gz
fastqc-1938004_5.out:Analysis complete for 23L000908_1.fastq.gz
fastqc-1938004_5.out:Analysis complete for 23L000908_2.fastq.gz
fastqc-1938004_6.out:Analysis complete for 23L000909_1.fastq.gz
fastqc-1938004_6.out:Analysis complete for 23L000909_2.fastq.gz
fastqc-1938004_7.out:Analysis complete for 23L000910_1.fastq.gz
fastqc-1938004_7.out:Analysis complete for 23L000910_2.fastq.gz
fastqc-1938004_8.out:Analysis complete for 23L000911_1.fastq.gz
fastqc-1938004_8.out:Analysis complete for 23L000911_2.fastq.gz
fastqc-1938004_9.out:Analysis complete for 23L000912_1.fastq.gz
fastqc-1938004_9.out:Analysis complete for 23L000912_2.fastq.gz

3.2 Run MultiQC

3.2.1 Move to the FastQC results folder

[huqj@login0b OUT]$ cd /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Raw/
[huqj@login0b Raw]$ ls
23L000903_1_fastqc.html  23L000906_1_fastqc.html  23L000909_1_fastqc.html  23L000912_1_fastqc.html  23L000919_1_fastqc.html
23L000903_1_fastqc.zip   23L000906_1_fastqc.zip   23L000909_1_fastqc.zip   23L000912_1_fastqc.zip   23L000919_1_fastqc.zip
23L000903_2_fastqc.html  23L000906_2_fastqc.html  23L000909_2_fastqc.html  23L000912_2_fastqc.html  23L000919_2_fastqc.html
23L000903_2_fastqc.zip   23L000906_2_fastqc.zip   23L000909_2_fastqc.zip   23L000912_2_fastqc.zip   23L000919_2_fastqc.zip
23L000904_1_fastqc.html  23L000907_1_fastqc.html  23L000910_1_fastqc.html  23L000913_1_fastqc.html  23L000920_1_fastqc.html
23L000904_1_fastqc.zip   23L000907_1_fastqc.zip   23L000910_1_fastqc.zip   23L000913_1_fastqc.zip   23L000920_1_fastqc.zip
23L000904_2_fastqc.html  23L000907_2_fastqc.html  23L000910_2_fastqc.html  23L000913_2_fastqc.html  23L000920_2_fastqc.html
23L000904_2_fastqc.zip   23L000907_2_fastqc.zip   23L000910_2_fastqc.zip   23L000913_2_fastqc.zip   23L000920_2_fastqc.zip
23L000905_1_fastqc.html  23L000908_1_fastqc.html  23L000911_1_fastqc.html  23L000914_1_fastqc.html
23L000905_1_fastqc.zip   23L000908_1_fastqc.zip   23L000911_1_fastqc.zip   23L000914_1_fastqc.zip
23L000905_2_fastqc.html  23L000908_2_fastqc.html  23L000911_2_fastqc.html  23L000914_2_fastqc.html
23L000905_2_fastqc.zip   23L000908_2_fastqc.zip   23L000911_2_fastqc.zip   23L000914_2_fastqc.zip

3.2.2 load the MultiQC module on HTC

[huqj@login0b Raw]$ module spider multiqc

----------------------------------------------------------------------------
  multiqc:
----------------------------------------------------------------------------
    Description:
      Aggregate results from bioinformatics analyses across many samples
      into a single report.

     Versions:
        multiqc/1.7
        multiqc/1.8
        multiqc/1.10.1
        multiqc/1.12
        multiqc/1.13

----------------------------------------------------------------------------
  For detailed information about a specific "multiqc" module (including how to load the modules) use the module's full name.
  For example:

     $ module spider multiqc/1.8
----------------------------------------------------------------------------

[huqj@login0b Raw]$ module load multiqc/1.12
[huqj@login0b Raw]$

3.2.3 Run MultiQC on the above FastQC files to summarize the results.

[huqj@login0b Raw]$ multiqc *.zip

  /// MultiQC 🔍 | v1.12

|           multiqc | MultiQC Version v1.14 now available!
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Raw/23L000903_1_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Raw/23L000903_2_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Raw/23L000904_1_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Raw/23L000904_2_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Raw/23L000905_1_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Raw/23L000905_2_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Raw/23L000906_1_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Raw/23L000906_2_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Raw/23L000907_1_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Raw/23L000907_2_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Raw/23L000908_1_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Raw/23L000908_2_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Raw/23L000909_1_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Raw/23L000909_2_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Raw/23L000910_1_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Raw/23L000910_2_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Raw/23L000911_1_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Raw/23L000911_2_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Raw/23L000912_1_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Raw/23L000912_2_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Raw/23L000913_1_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Raw/23L000913_2_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Raw/23L000914_1_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Raw/23L000914_2_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Raw/23L000919_1_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Raw/23L000919_2_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Raw/23L000920_1_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Raw/23L000920_2_fastqc.zip
|         searching | ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 28/28  
|            fastqc | Found 28 reports
|           multiqc | Compressing plot data
|           multiqc | Report      : multiqc_report.html
|           multiqc | Data        : multiqc_data
|           multiqc | MultiQC complete
[huqj@login0b Raw]$ ^C
[huqj@login0b Raw]$ ls
23L000903_1_fastqc.html  23L000906_1_fastqc.html  23L000909_1_fastqc.html  23L000912_1_fastqc.html  23L000919_1_fastqc.html
23L000903_1_fastqc.zip   23L000906_1_fastqc.zip   23L000909_1_fastqc.zip   23L000912_1_fastqc.zip   23L000919_1_fastqc.zip
23L000903_2_fastqc.html  23L000906_2_fastqc.html  23L000909_2_fastqc.html  23L000912_2_fastqc.html  23L000919_2_fastqc.html
23L000903_2_fastqc.zip   23L000906_2_fastqc.zip   23L000909_2_fastqc.zip   23L000912_2_fastqc.zip   23L000919_2_fastqc.zip
23L000904_1_fastqc.html  23L000907_1_fastqc.html  23L000910_1_fastqc.html  23L000913_1_fastqc.html  23L000920_1_fastqc.html
23L000904_1_fastqc.zip   23L000907_1_fastqc.zip   23L000910_1_fastqc.zip   23L000913_1_fastqc.zip   23L000920_1_fastqc.zip
23L000904_2_fastqc.html  23L000907_2_fastqc.html  23L000910_2_fastqc.html  23L000913_2_fastqc.html  23L000920_2_fastqc.html
23L000904_2_fastqc.zip   23L000907_2_fastqc.zip   23L000910_2_fastqc.zip   23L000913_2_fastqc.zip   23L000920_2_fastqc.zip
23L000905_1_fastqc.html  23L000908_1_fastqc.html  23L000911_1_fastqc.html  23L000914_1_fastqc.html  multiqc_data
23L000905_1_fastqc.zip   23L000908_1_fastqc.zip   23L000911_1_fastqc.zip   23L000914_1_fastqc.zip   multiqc_report.html
23L000905_2_fastqc.html  23L000908_2_fastqc.html  23L000911_2_fastqc.html  23L000914_2_fastqc.html
23L000905_2_fastqc.zip   23L000908_2_fastqc.zip   23L000911_2_fastqc.zip   23L000914_2_fastqc.zip

4. Run Cutadapt

[huqj@login0b ~]$ cd /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/Jobs/Cutadapt
[huqj@login0b Cutadapt]$ ls
cutadapt.job  OUT
[huqj@login0b Cutadapt]$ vim cutadapt.job
[huqj@login0b Cutadapt]$ sbatch cutadapt.job
#!/bin/bash
# cutadapt.job - adapter/quality trimming of the paired-end reads of one sample.
#
# Submitted as a SLURM array job: every array task processes the sample whose
# zero-based position in Data/raw/Mouse_PCLS.txt equals SLURM_ARRAY_TASK_ID.
# Reads  : $project/Data/merged_fastq/<sample>_1.fastq.gz and <sample>_2.fastq.gz
# Writes : $project/Data/Cutadapt/<sample>_1.cutadapt.fastq.gz and <sample>_2.cutadapt.fastq.gz
# The OUT/ directory used for the job log must exist before `sbatch` is called.
#SBATCH -N 1
#SBATCH -J cutadapt
#SBATCH -c 2
#SBATCH -t 2:00:00
#SBATCH -o OUT/cutadapt-%A_%a.out
#SBATCH --array=0-13 # job array index
#SBATCH --mail-type=ALL
#SBATCH --mail-user=huqj@pitt.edu
########################################

## Cutadapt set-up
module purge
module load cutadapt/2.10

# Strict mode is switched on after the module commands so it applies to our own commands only.
set -euo pipefail
set -x
#########################

project=/bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS
sample_list=$project/Data/raw/Mouse_PCLS.txt

if [[ ! -r "$sample_list" ]]; then
  echo "cannot read sample list: $sample_list" >&2
  exit 1
fi

# Split the list on any whitespace (as the former $(cat ...) did) but without glob expansion.
# `read -d ''` returns non-zero at end of file even on success, hence `|| true`.
read -r -d '' -a names < "$sample_list" || true

task_id=${SLURM_ARRAY_TASK_ID:?this script must be submitted as a SLURM array job}
if (( task_id >= ${#names[@]} )); then
  echo "array index $task_id is outside the ${#names[@]} samples listed in $sample_list" >&2
  exit 1
fi

sample=${names[task_id]}
echo "$sample"

fastq=$project/Data/merged_fastq
out=$project/Data/Cutadapt

##########################

mkdir -p "$out"

# -j              use every CPU allocated by SLURM (cutadapt defaults to a single core)
# -m 50           drop reads shorter than 50 nt after trimming
# -q 15           quality-trim 3' ends below Q15
# -a / -A         3' adapter for R1 / R2
#                 NOTE(review): sequence looks like the Nextera transposase adapter - confirm against the library kit
# -u 5 / -U 5     remove the first 5 bases of R1 / R2
# --pair-filter=any  discard the pair when either mate fails a filter
cutadapt -j "${SLURM_CPUS_PER_TASK:-1}" -m 50 -q 15 \
  -a CTGTCTCTTATA \
  -A CTGTCTCTTATA \
  -u 5 \
  -U 5 \
  --pair-filter=any \
  -o "$out/${sample}_1.cutadapt.fastq.gz" \
  -p "$out/${sample}_2.cutadapt.fastq.gz" \
  "$fastq/${sample}_1.fastq.gz" "$fastq/${sample}_2.fastq.gz"

check report


[huqj@login0b OUT]$ grep 'Total written (filtered)' *
cutadapt-1938038_0.out:Total written (filtered):  5,737,433,977 bp (92.6%)
cutadapt-1938038_10.out:Total written (filtered):  6,503,310,479 bp (91.7%)
cutadapt-1938038_11.out:Total written (filtered):  5,726,695,218 bp (91.6%)
cutadapt-1938038_12.out:Total written (filtered):  5,273,527,593 bp (92.5%)
cutadapt-1938038_13.out:Total written (filtered):  5,565,266,902 bp (92.6%)
cutadapt-1938038_1.out:Total written (filtered):  5,531,526,492 bp (91.8%)
cutadapt-1938038_2.out:Total written (filtered):  5,597,951,508 bp (92.3%)
cutadapt-1938038_3.out:Total written (filtered):  5,887,894,091 bp (91.9%)
cutadapt-1938038_4.out:Total written (filtered):  4,839,106,610 bp (92.6%)
cutadapt-1938038_5.out:Total written (filtered):  5,631,820,340 bp (92.7%)
cutadapt-1938038_6.out:Total written (filtered):  6,610,218,044 bp (92.7%)
cutadapt-1938038_7.out:Total written (filtered):  5,979,813,844 bp (92.8%)
cutadapt-1938038_8.out:Total written (filtered):  6,135,924,682 bp (92.5%)
cutadapt-1938038_9.out:Total written (filtered):  5,743,278,028 bp (92.7%)

5. QC after cutadapt

5.1 Run fastq_screen

5.1.1 edit fastq_screen.job

[huqj@login0b fastq_screen]$ cd /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/Jobs/fastq_screen
[huqj@login0b fastq_screen]$ vim fastq_screen.job
#!/bin/bash
#
# fastq_screen.job - screen all trimmed reads against the genomes listed in
# fastq_screen.conf.
#
# Single (non-array) job: it processes every *cutadapt.fastq.gz file found in
# $project/Data/Cutadapt and writes the reports to $project/QC/fastq_screen.
# The log name therefore uses %j (job id); %A_%a is only meaningful for array jobs.
# The OUT/ directory used for the job log must exist before `sbatch` is called.
#
#SBATCH --job-name=fastq_screen
#SBATCH -N 1
#SBATCH --cpus-per-task=2 # Request that ncpus be allocated per process.
#SBATCH -t 1-00:00 # Runtime in D-HH:MM
#SBATCH -o OUT/fastq_screen-%j.out
#SBATCH --mail-type=ALL
#SBATCH --mail-user=huqj@pitt.edu

module load bowtie2/2.4.5
module load fastq_screen/0.13.0

# Strict mode is switched on after the module commands so it applies to our own commands only.
set -euo pipefail

project=/bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS
files=$project/Data/Cutadapt
outfile=$project/QC/fastq_screen
confile=$project/Jobs/fastq_screen/fastq_screen.conf

# Collect the inputs first so an empty or missing Cutadapt folder is reported
# instead of passing the literal pattern to fastq_screen.
shopt -s nullglob
inputs=("$files"/*cutadapt.fastq.gz)
if (( ${#inputs[@]} == 0 )); then
  echo "no *cutadapt.fastq.gz files found in $files" >&2
  exit 1
fi

mkdir -p "$outfile"

# --threads overrides THREADS from the configuration file so the aligner never
# uses more threads than the CPUs SLURM allocated to this job.
fastq_screen --conf "$confile" \
  --threads "${SLURM_CPUS_PER_TASK:-2}" \
  --outdir "$outfile" \
  "${inputs[@]}"

5.1.2 edit fastq_screen.conf to add aligner and database

[huqj@login0b fastq_screen]$ vim fastq_screen.conf
# This is an example configuration file for FastQ Screen

############################
## Bowtie, Bowtie 2 or BWA #
############################
## If the Bowtie, Bowtie 2 or BWA binary is not in your PATH, you can set
## this value to tell the program where to find your chosen aligner.  Uncomment
## the relevant line below and set the appropriate location.  Please note,
## this path should INCLUDE the executable filename.

#BOWTIE /usr/local/bin/bowtie/bowtie
BOWTIE2 /ihome/crc/install/bowtie2/bowtie2-2.4.5-linux-x86_64/bowtie2
#BWA /usr/local/bwa/bwa



############################################
## Bismark (for bisulfite sequencing only) #
############################################
## If the Bismark binary is not in your PATH then you can set this value to
## tell the program where to find it.  Uncomment the line below and set the
## appropriate location. Please note, this path should INCLUDE the executable
## filename.

#BISMARK        /usr/local/bin/bismark/bismark



############
## Threads #
############
## Genome aligners can be made to run across multiple CPU cores to speed up
## searches.  Set this value to the number of cores you want for mapping reads.
##
## NOTE(review): keep this value in line with the CPUs requested by the job
## script (fastq_screen.job asks for --cpus-per-task=2); more aligner threads
## than allocated CPUs only oversubscribes the cores.

THREADS         8



##############
## DATABASES #
##############
## This section enables you to configure multiple genomes databases (aligner index
## files) to search against in your screen.  For each genome you need to provide a
## database name (which can't contain spaces) and the location of the aligner index
## files.
##
## The path to the index files SHOULD INCLUDE THE BASENAME of the index, e.g:
## /data/public/Genomes/Human_Bowtie/GRCh37/Homo_sapiens.GRCh37
## Thus, the index files (Homo_sapiens.GRCh37.1.bt2, Homo_sapiens.GRCh37.2.bt2, etc.)
## are found in a folder named 'GRCh37'.
##
## If, for example, the Bowtie, Bowtie2 and BWA indices of a given genome reside in
## the SAME FOLDER, a SINGLE path may be provided to ALL of the indices.  The index
## used will be the one compatible with the chosen aligner (as specified using the
## --aligner flag).
##
## The entries shown below are only suggested examples, you can add as many DATABASE
## sections as required, and you can comment out or remove as many of the existing
## entries as desired.  We suggest including genomes and sequences that may be sources
## of contamination either because they were run on your sequencer previously, or may
## have contaminated your sample during the library preparation step.
##
## Human - sequences available from
## ftp://ftp.ensembl.org/pub/current/fasta/homo_sapiens/dna/
DATABASE        Human   /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/Refs/fastq_screen_database/Human/GRCh38_noalt_as/GRCh38_noalt_as
##
## Mouse - sequence available from
## ftp://ftp.ensembl.org/pub/current/fasta/mus_musculus/dna/
DATABASE        Mouse   /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/Refs/fastq_screen_database/Mouse/GRCm39/GRCm39
##
## Ecoli - sequence available from EMBL accession U00096.2
#DATABASE       Ecoli   /data/public/Genomes/Ecoli/Ecoli
##
## PhiX - sequence available from Refseq accession NC_001422.1
#DATABASE       PhiX    /data/public/Genomes/PhiX/phi_plus_SNPs
##
## Adapters - sequence derived from the FastQC contaminants file found at: www.bioinformatics.babraham.ac.uk/projects/fastqc
#DATABASE       Adapters        /data/public/Genomes/Contaminants/Contaminants
##
## Vector - Sequence taken from the UniVec database
## http://www.ncbi.nlm.nih.gov/VecScreen/UniVec.html
#DATABASE       Vectors         /data/public/Genomes/Vectors/Vectors

5.1.3 submit job

[huqj@login0b fastq_screen]$ sbatch fastq_screen.job

5.1.4 download the report from HTC cluster to local

qianjianghu@Qianjiangs-MacBook-Pro: ~
# NOTE(review): source path changed from ML_human_PCLS to this workflow's ML_mouse_PCLS project;
# the transfer log below appears to come from the human-project run - confirm.
$ scp -r huqj@htc.crc.pitt.edu:/bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/fastq_screen/ ./Desktop/ML_PCLS
huqj@htc.crc.pitt.edu's password:
23L000915_1.cutadapt_screen.txt               100%  476     8.4KB/s   00:00
23L000918_2.cutadapt_screen.txt               100%  474     8.4KB/s   00:00
23L000922_1.cutadapt_screen.txt               100%  474     6.5KB/s   00:00
23L000919_1.cutadapt_screen.txt               100%  475    10.0KB/s   00:00
23L000922_1.cutadapt_screen.html              100% 3281KB   1.4MB/s   00:02
23L000915_2.cutadapt_screen.html              100% 3281KB   2.6MB/s   00:01
23L000919_2.cutadapt_screen.txt               100%  475     6.4KB/s   00:00
23L000917_2.cutadapt_screen.html              100% 3281KB   3.2MB/s   00:01
23L000919_1.cutadapt_screen.html              100% 3281KB   2.2MB/s   00:01
23L000915_2.cutadapt_screen.txt               100%  476    10.0KB/s   00:00
23L000920_1.cutadapt_screen.html              100% 3281KB   1.8MB/s   00:01
23L000918_1.cutadapt_screen.txt               100%  474     6.3KB/s   00:00
23L000922_2.cutadapt_screen.txt               100%  474     4.2KB/s   00:00
23L000921_1.cutadapt_screen.html              100% 3281KB   2.0MB/s   00:01
23L000918_1.cutadapt_screen.html              100% 3281KB   2.1MB/s   00:01
23L000916_2.cutadapt_screen.html              100% 3281KB   1.8MB/s   00:01
23L000917_1.cutadapt_screen.txt               100%  476     7.1KB/s   00:00
23L000920_2.cutadapt_screen.html              100% 3281KB   2.3MB/s   00:01
23L000920_1.cutadapt_screen.txt               100%  473     5.6KB/s   00:00
23L000921_2.cutadapt_screen.txt               100%  473     3.8KB/s   00:00
23L000919_2.cutadapt_screen.html              100% 3281KB   2.3MB/s   00:01
23L000917_1.cutadapt_screen.html              100% 3281KB   2.3MB/s   00:01
23L000916_2.cutadapt_screen.txt               100%  473     3.3KB/s   00:00
23L000916_1.cutadapt_screen.html              100% 3281KB   2.3MB/s   00:01
23L000918_2.cutadapt_screen.html              100% 3281KB   2.1MB/s   00:01
23L000921_2.cutadapt_screen.html              100% 3281KB   1.9MB/s   00:01
23L000921_1.cutadapt_screen.txt               100%  473     2.3KB/s   00:00
23L000916_1.cutadapt_screen.txt               100%  474     4.2KB/s   00:00
23L000917_2.cutadapt_screen.txt               100%  476     6.7KB/s   00:00
23L000920_2.cutadapt_screen.txt               100%  474     4.1KB/s   00:00
23L000915_1.cutadapt_screen.html              100% 3281KB   2.2MB/s   00:01
23L000922_2.cutadapt_screen.html              100% 3281KB   2.0MB/s   00:01
(base)

5.2 Run FastQC

[huqj@login0b fastq_screen]$ cd /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/Jobs/FastQC/
[huqj@login0b FastQC]$ vim fastqc_cutadapt.job
[huqj@login0b FastQC]$ sbatch fastqc_cutadapt.job
#!/bin/bash
# fastqc_cutadapt.job - run FastQC on the cutadapt-trimmed reads of one sample.
#
# Submitted as a SLURM array job: every array task processes the sample whose
# zero-based position in Data/raw/Mouse_PCLS.txt equals SLURM_ARRAY_TASK_ID.
# Reads  : $project/Data/Cutadapt/<sample>_1.cutadapt.fastq.gz and <sample>_2.cutadapt.fastq.gz
# Writes : FastQC reports into $project/QC/FastQC/Cutadapt/
# The OUT_cutadapt/ directory used for the job log must exist before `sbatch` is called.
#
# Two CPUs are requested because FastQC parallelises per input file and each
# task has exactly two files (R1 and R2).
#SBATCH -J fastqc_cutadapt
#SBATCH -c 2
#SBATCH -t 2:00:00
#SBATCH -o OUT_cutadapt/fastqc-%A_%a.out
#SBATCH --array=0-13 # job array index
#SBATCH --mail-type=ALL
#SBATCH --mail-user=huqj@pitt.edu
###########

####### set-up fastqc
module load fastqc/0.11.7

# Strict mode is switched on after `module load` so it applies to our own commands only.
set -euo pipefail
set -x


project=/bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS
sample_list=$project/Data/raw/Mouse_PCLS.txt

if [[ ! -r "$sample_list" ]]; then
  echo "cannot read sample list: $sample_list" >&2
  exit 1
fi

# Split the list on any whitespace (as the former $(cat ...) did) but without glob expansion.
# `read -d ''` returns non-zero at end of file even on success, hence `|| true`.
read -r -d '' -a names < "$sample_list" || true

task_id=${SLURM_ARRAY_TASK_ID:?this script must be submitted as a SLURM array job}
if (( task_id >= ${#names[@]} )); then
  echo "array index $task_id is outside the ${#names[@]} samples listed in $sample_list" >&2
  exit 1
fi

sample=${names[task_id]}
echo "$sample"

fastq=$project/Data/Cutadapt
out=$project/QC/FastQC/Cutadapt/

################

mkdir -p "$out"

# The trimmed files are named <sample>_1.cutadapt.fastq.gz / <sample>_2.cutadapt.fastq.gz.
# The former pattern ${sample}_*_1.cutadapt.fastq.gz required an extra "_<something>"
# segment and therefore matched none of them.
# One invocation for both mates; --threads lets FastQC process R1 and R2 at the same time.
fastqc --threads "${SLURM_CPUS_PER_TASK:-2}" -o "$out" \
  "$fastq/${sample}_1.cutadapt.fastq.gz" \
  "$fastq/${sample}_2.cutadapt.fastq.gz"

check report

[huqj@login0b OUT_cutadapt]$ grep 'Analysis complete for' *.out
fastqc-1938131_0.out:Analysis complete for 23L000903_1.cutadapt.fastq.gz
fastqc-1938131_0.out:Analysis complete for 23L000903_2.cutadapt.fastq.gz
fastqc-1938131_10.out:Analysis complete for 23L000913_1.cutadapt.fastq.gz
fastqc-1938131_10.out:Analysis complete for 23L000913_2.cutadapt.fastq.gz
fastqc-1938131_11.out:Analysis complete for 23L000914_1.cutadapt.fastq.gz
fastqc-1938131_11.out:Analysis complete for 23L000914_2.cutadapt.fastq.gz
fastqc-1938131_12.out:Analysis complete for 23L000919_1.cutadapt.fastq.gz
fastqc-1938131_12.out:Analysis complete for 23L000919_2.cutadapt.fastq.gz
fastqc-1938131_13.out:Analysis complete for 23L000920_1.cutadapt.fastq.gz
fastqc-1938131_13.out:Analysis complete for 23L000920_2.cutadapt.fastq.gz
fastqc-1938131_1.out:Analysis complete for 23L000904_1.cutadapt.fastq.gz
fastqc-1938131_1.out:Analysis complete for 23L000904_2.cutadapt.fastq.gz
fastqc-1938131_2.out:Analysis complete for 23L000905_1.cutadapt.fastq.gz
fastqc-1938131_2.out:Analysis complete for 23L000905_2.cutadapt.fastq.gz
fastqc-1938131_3.out:Analysis complete for 23L000906_1.cutadapt.fastq.gz
fastqc-1938131_3.out:Analysis complete for 23L000906_2.cutadapt.fastq.gz
fastqc-1938131_4.out:Analysis complete for 23L000907_1.cutadapt.fastq.gz
fastqc-1938131_4.out:Analysis complete for 23L000907_2.cutadapt.fastq.gz
fastqc-1938131_5.out:Analysis complete for 23L000908_1.cutadapt.fastq.gz
fastqc-1938131_5.out:Analysis complete for 23L000908_2.cutadapt.fastq.gz
fastqc-1938131_6.out:Analysis complete for 23L000909_1.cutadapt.fastq.gz
fastqc-1938131_6.out:Analysis complete for 23L000909_2.cutadapt.fastq.gz
fastqc-1938131_7.out:Analysis complete for 23L000910_1.cutadapt.fastq.gz
fastqc-1938131_7.out:Analysis complete for 23L000910_2.cutadapt.fastq.gz
fastqc-1938131_8.out:Analysis complete for 23L000911_1.cutadapt.fastq.gz
fastqc-1938131_8.out:Analysis complete for 23L000911_2.cutadapt.fastq.gz
fastqc-1938131_9.out:Analysis complete for 23L000912_1.cutadapt.fastq.gz
fastqc-1938131_9.out:Analysis complete for 23L000912_2.cutadapt.fastq.gz

5.3 Run MultiQC

[huqj@login0b]$ cd /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Cutadapt
[huqj@login0b Cutadapt]$ module load multiqc/1.12
[huqj@login0b Cutadapt]$ multiqc *.zip

  /// MultiQC 🔍 | v1.12

|           multiqc | MultiQC Version v1.14 now available!
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Cutadapt/23L000903_1.cutadapt_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Cutadapt/23L000903_2.cutadapt_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Cutadapt/23L000904_1.cutadapt_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Cutadapt/23L000904_2.cutadapt_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Cutadapt/23L000905_1.cutadapt_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Cutadapt/23L000905_2.cutadapt_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Cutadapt/23L000906_1.cutadapt_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Cutadapt/23L000906_2.cutadapt_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Cutadapt/23L000907_1.cutadapt_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Cutadapt/23L000907_2.cutadapt_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Cutadapt/23L000908_1.cutadapt_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Cutadapt/23L000908_2.cutadapt_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Cutadapt/23L000909_1.cutadapt_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Cutadapt/23L000909_2.cutadapt_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Cutadapt/23L000910_1.cutadapt_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Cutadapt/23L000910_2.cutadapt_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Cutadapt/23L000911_1.cutadapt_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Cutadapt/23L000911_2.cutadapt_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Cutadapt/23L000912_1.cutadapt_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Cutadapt/23L000912_2.cutadapt_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Cutadapt/23L000913_1.cutadapt_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Cutadapt/23L000913_2.cutadapt_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Cutadapt/23L000914_1.cutadapt_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Cutadapt/23L000914_2.cutadapt_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Cutadapt/23L000919_1.cutadapt_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Cutadapt/23L000919_2.cutadapt_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Cutadapt/23L000920_1.cutadapt_fastqc.zip
|           multiqc | Search path : /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/FastQC/Cutadapt/23L000920_2.cutadapt_fastqc.zip
|         searching | ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 28/28
|            fastqc | Found 28 reports
|           multiqc | Compressing plot data
|           multiqc | Report      : multiqc_report.html
|           multiqc | Data        : multiqc_data
|           multiqc | MultiQC complete

5.4 Download the QC reports to the local machine

qianjianghu@Qianjiangs-MacBook-Pro: ~
$ scp -r huqj@htc.crc.pitt.edu:/bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/QC/ ./Desktop/ML_mouse_PCLS/
huqj@htc.crc.pitt.edu's password:
23L000910_2.cutadapt_screen.html                           100% 3281KB   1.6MB/s   00:02
23L000912_1.cutadapt_screen.txt                            100%  472     6.9KB/s   00:00
23L000909_1.cutadapt_screen.html                           100% 3281KB   2.9MB/s   00:01
23L000919_2.cutadapt_screen.txt                            100%  475     7.7KB/s   00:00
23L000907_2.cutadapt_screen.html                           100% 3281KB   3.0MB/s   00:01
23L000914_1.cutadapt_screen.txt                            100%  473     6.8KB/s   00:00
23L000913_2.cutadapt_screen.txt                            100%  474     9.3KB/s   00:00
23L000903_2.cutadapt_screen.txt                            100%  472     7.4KB/s   00:00
23L000905_2.cutadapt_screen.txt                            100%  476     9.0KB/s   00:00
23L000908_1.cutadapt_screen.txt                            100%  472     7.5KB/s   00:00
23L000906_2.cutadapt_screen.html                           100% 3281KB   3.7MB/s   00:00
23L000903_1.cutadapt_screen.html                           100% 3281KB   4.0MB/s   00:00
23L000909_2.cutadapt_screen.txt                            100%  474     9.2KB/s   00:00
23L000908_1.cutadapt_screen.html                           100% 3281KB   3.6MB/s   00:00
23L000904_1.cutadapt_screen.txt                            100%  472     8.2KB/s   00:00
23L000911_2.cutadapt_screen.html                           100% 3281KB   3.5MB/s   00:00
23L000914_1.cutadapt_screen.html                           100% 3281KB   4.0MB/s   00:00
23L000913_1.cutadapt_screen.txt                            100%  474     9.8KB/s   00:00
23L000904_2.cutadapt_screen.html                           100% 3281KB   4.3MB/s   00:00
23L000913_2.cutadapt_screen.html                           100% 3281KB   4.5MB/s   00:00
23L000919_1.cutadapt_screen.txt                            100%  475    10.1KB/s   00:00
23L000914_2.cutadapt_screen.txt                            100%  475     6.2KB/s   00:00
23L000912_2.cutadapt_screen.txt                            100%  472     6.4KB/s   00:00
23L000909_1.cutadapt_screen.txt                            100%  474     9.3KB/s   00:00
23L000904_2.cutadapt_screen.txt                            100%  472     7.2KB/s   00:00
23L000920_2.cutadapt_screen.html                           100% 3281KB   5.8MB/s   00:00
23L000905_1.cutadapt_screen.txt                            100%  476    12.0KB/s   00:00
23L000919_2.cutadapt_screen.html                           100% 3281KB   4.4MB/s   00:00
23L000912_2.cutadapt_screen.html                           100% 3281KB   5.0MB/s   00:00
23L000908_2.cutadapt_screen.txt                            100%  472     4.7KB/s   00:00
23L000905_2.cutadapt_screen.html                           100% 3281KB   5.7MB/s   00:00
23L000903_1.cutadapt_screen.txt                            100%  472    10.4KB/s   00:00
23L000910_1.cutadapt_screen.txt                            100%  474     6.1KB/s   00:00
23L000904_1.cutadapt_screen.html                           100% 3281KB   6.6MB/s   00:00
23L000911_2.cutadapt_screen.txt                            100%  477     4.3KB/s   00:00
23L000913_1.cutadapt_screen.html                           100% 3281KB   5.6MB/s   00:00
23L000920_2.cutadapt_screen.txt                            100%  474     6.5KB/s   00:00
23L000919_1.cutadapt_screen.html                           100% 3281KB   5.0MB/s   00:00
23L000912_1.cutadapt_screen.html                           100% 3281KB   4.0MB/s   00:00
23L000907_2.cutadapt_screen.txt                            100%  473     9.5KB/s   00:00
23L000905_1.cutadapt_screen.html                           100% 3281KB   4.4MB/s   00:00
23L000920_1.cutadapt_screen.html                           100% 3281KB   4.6MB/s   00:00
23L000906_1.cutadapt_screen.txt                            100%  471     8.0KB/s   00:00
23L000920_1.cutadapt_screen.txt                            100%  473     3.9KB/s   00:00
23L000911_1.cutadapt_screen.txt                            100%  474     7.8KB/s   00:00
23L000910_2.cutadapt_screen.txt                            100%  475     3.7KB/s   00:00
23L000910_1.cutadapt_screen.html                           100% 3281KB   4.9MB/s   00:00
23L000907_1.cutadapt_screen.html                           100% 3281KB   5.2MB/s   00:00
23L000909_2.cutadapt_screen.html                           100% 3281KB   4.3MB/s   00:00
23L000903_2.cutadapt_screen.html                           100% 3281KB   3.4MB/s   00:00
23L000908_2.cutadapt_screen.html                           100% 3281KB   4.0MB/s   00:00
23L000906_2.cutadapt_screen.txt                            100%  472     9.3KB/s   00:00
23L000906_1.cutadapt_screen.html                           100% 3281KB   3.3MB/s   00:00
23L000914_2.cutadapt_screen.html                           100% 3281KB   2.8MB/s   00:01
23L000911_1.cutadapt_screen.html                           100% 3281KB   2.7MB/s   00:01
23L000907_1.cutadapt_screen.txt                            100%  472     3.9KB/s   00:00
23L000919_1.cutadapt_fastqc.zip                            100%  707KB   2.6MB/s   00:00
23L000914_2.cutadapt_fastqc.zip                            100%  703KB   2.4MB/s   00:00
23L000912_2.cutadapt_fastqc.zip                            100%  706KB   2.7MB/s   00:00
23L000913_1.cutadapt_fastqc.zip                            100%  704KB   3.4MB/s   00:00
23L000910_1.cutadapt_fastqc.html                           100%  238KB   1.2MB/s   00:00
23L000909_2.cutadapt_fastqc.html                           100%  239KB   1.2MB/s   00:00
23L000907_1.cutadapt_fastqc.html                           100%  244KB   1.0MB/s   00:00
23L000905_1.cutadapt_fastqc.zip                            100%  705KB   2.4MB/s   00:00
23L000906_1.cutadapt_fastqc.html                           100%  238KB   1.7MB/s   00:00
23L000908_2.cutadapt_fastqc.zip                            100%  704KB   1.9MB/s   00:00
23L000903_2.cutadapt_fastqc.html                           100%  241KB   1.3MB/s   00:00
23L000908_2.cutadapt_fastqc.html                           100%  238KB   1.7MB/s   00:00
23L000903_1.cutadapt_fastqc.zip                            100%  711KB   1.9MB/s   00:00
23L000911_1.cutadapt_fastqc.html                           100%  244KB   1.4MB/s   00:00
23L000914_2.cutadapt_fastqc.html                           100%  238KB   1.4MB/s   00:00
23L000909_1.cutadapt_fastqc.zip                            100%  707KB   2.0MB/s   00:00
23L000904_2.cutadapt_fastqc.zip                            100%  717KB   2.5MB/s   00:00
23L000913_2.cutadapt_fastqc.zip                            100%  705KB   2.6MB/s   00:00
23L000912_1.cutadapt_fastqc.zip                            100%  712KB   2.2MB/s   00:00
23L000904_1.cutadapt_fastqc.html                           100%  246KB   2.0MB/s   00:00
23L000919_2.cutadapt_fastqc.zip                            100%  707KB   2.1MB/s   00:00
23L000913_1.cutadapt_fastqc.html                           100%  238KB   1.5MB/s   00:00
23L000914_1.cutadapt_fastqc.zip                            100%  704KB   2.5MB/s   00:00
23L000919_1.cutadapt_fastqc.html                           100%  240KB   1.7MB/s   00:00
23L000912_1.cutadapt_fastqc.html                           100%  245KB   1.7MB/s   00:00
23L000909_2.cutadapt_fastqc.zip                            100%  704KB   2.8MB/s   00:00
23L000904_1.cutadapt_fastqc.zip                            100%  717KB   3.1MB/s   00:00
23L000905_1.cutadapt_fastqc.html                           100%  238KB   1.7MB/s   00:00
23L000903_2.cutadapt_fastqc.zip                            100%  707KB   2.3MB/s   00:00
23L000905_2.cutadapt_fastqc.zip                            100%  706KB   2.5MB/s   00:00
23L000920_1.cutadapt_fastqc.html                           100%  245KB   1.6MB/s   00:00
23L000908_1.cutadapt_fastqc.zip                            100%  705KB   2.6MB/s   00:00
23L000910_2.cutadapt_fastqc.zip                            100%  705KB   2.2MB/s   00:00
23L000904_2.cutadapt_fastqc.html                           100%  247KB   1.7MB/s   00:00
23L000913_2.cutadapt_fastqc.html                           100%  239KB   1.4MB/s   00:00
23L000920_1.cutadapt_fastqc.zip                            100%  712KB   2.2MB/s   00:00
23L000911_1.cutadapt_fastqc.zip                            100%  711KB   2.4MB/s   00:00
23L000907_1.cutadapt_fastqc.zip                            100%  712KB   2.4MB/s   00:00
23L000920_2.cutadapt_fastqc.html                           100%  239KB   1.8MB/s   00:00
23L000919_2.cutadapt_fastqc.html                           100%  241KB   1.6MB/s   00:00
23L000906_2.cutadapt_fastqc.zip                            100%  702KB   2.4MB/s   00:00
23L000912_2.cutadapt_fastqc.html                           100%  240KB   1.5MB/s   00:00
multiqc_report.html                                        100% 1466KB   3.0MB/s   00:00
23L000905_2.cutadapt_fastqc.html                           100%  240KB   1.6MB/s   00:00
23L000910_2.cutadapt_fastqc.html                           100%  240KB   1.6MB/s   00:00
23L000911_2.cutadapt_fastqc.zip                            100%  710KB   2.2MB/s   00:00
multiqc_citations.txt                                      100%   62     1.3KB/s   00:00
multiqc_data.json                                          100% 1672KB   3.1MB/s   00:00
multiqc_general_stats.txt                                  100% 2545    47.1KB/s   00:00
multiqc_fastqc.txt                                         100% 6159   101.2KB/s   00:00
multiqc_sources.txt                                        100% 4010    81.6KB/s   00:00
multiqc.log                                                100%   11KB 191.7KB/s   00:00
23L000907_2.cutadapt_fastqc.html                           100%  243KB   2.2MB/s   00:00
23L000909_1.cutadapt_fastqc.html                           100%  241KB   1.5MB/s   00:00
23L000920_2.cutadapt_fastqc.zip                            100%  705KB   2.2MB/s   00:00
23L000910_1.cutadapt_fastqc.zip                            100%  704KB   2.7MB/s   00:00
23L000906_1.cutadapt_fastqc.zip                            100%  701KB   2.8MB/s   00:00
23L000903_1.cutadapt_fastqc.html                           100%  243KB   1.7MB/s   00:00
23L000908_1.cutadapt_fastqc.html                           100%  239KB   1.6MB/s   00:00
23L000906_2.cutadapt_fastqc.html                           100%  238KB   1.4MB/s   00:00
23L000907_2.cutadapt_fastqc.zip                            100%  710KB   2.9MB/s   00:00
23L000914_1.cutadapt_fastqc.html                           100%  238KB   2.0MB/s   00:00
23L000911_2.cutadapt_fastqc.html                           100%  243KB   1.6MB/s   00:00
23L000907_1_fastqc.zip                                     100%  756KB   2.8MB/s   00:00
23L000920_2_fastqc.html                                    100%  253KB   1.4MB/s   00:00
23L000909_1_fastqc.html                                    100%  259KB   1.8MB/s   00:00
23L000906_1_fastqc.zip                                     100%  752KB   2.5MB/s   00:00
23L000908_2_fastqc.html                                    100%  253KB   2.3MB/s   00:00
23L000919_2_fastqc.zip                                     100%  745KB   2.9MB/s   00:00
23L000904_1_fastqc.zip                                     100%  760KB   3.2MB/s   00:00
23L000905_1_fastqc.zip                                     100%  752KB   2.3MB/s   00:00
23L000919_2_fastqc.html                                    100%  252KB   1.9MB/s   00:00
23L000903_1_fastqc.html                                    100%  260KB   2.4MB/s   00:00
23L000907_2_fastqc.html                                    100%  253KB   2.1MB/s   00:00
23L000912_1_fastqc.html                                    100%  261KB   1.9MB/s   00:00
23L000903_1_fastqc.zip                                     100%  757KB   3.6MB/s   00:00
23L000913_2_fastqc.html                                    100%  251KB   1.6MB/s   00:00
23L000906_1_fastqc.html                                    100%  255KB   1.9MB/s   00:00
23L000905_2_fastqc.html                                    100%  250KB   1.2MB/s   00:00
23L000910_1_fastqc.html                                    100%  257KB   2.0MB/s   00:00
23L000914_2_fastqc.zip                                     100%  742KB   2.6MB/s   00:00
23L000909_1_fastqc.zip                                     100%  754KB   3.0MB/s   00:00
23L000911_2_fastqc.html                                    100%  256KB   1.6MB/s   00:00
23L000904_1_fastqc.html                                    100%  264KB   2.3MB/s   00:00
23L000908_1_fastqc.zip                                     100%  754KB   2.9MB/s   00:00
23L000914_2_fastqc.html                                    100%  250KB   1.8MB/s   00:00
23L000920_1_fastqc.zip                                     100%  757KB   2.5MB/s   00:00
23L000913_2_fastqc.zip                                     100%  743KB   2.7MB/s   00:00
23L000912_2_fastqc.zip                                     100%  748KB   2.7MB/s   00:00
23L000910_2_fastqc.zip                                     100%  749KB   2.7MB/s   00:00
multiqc_sources.txt                                        100% 3618    62.7KB/s   00:00
multiqc_general_stats.txt                                  100% 2213    32.6KB/s   00:00
multiqc_citations.txt                                      100%   62     1.4KB/s   00:00
multiqc.log                                                100%   10KB 174.1KB/s   00:00
multiqc_data.json                                          100% 1710KB   3.2MB/s   00:00
multiqc_fastqc.txt                                         100% 5578   105.2KB/s   00:00
23L000911_2_fastqc.zip                                     100%  753KB   3.2MB/s   00:00
multiqc_report.html                                        100% 1484KB   3.6MB/s   00:00
23L000903_2_fastqc.zip                                     100%  748KB   2.7MB/s   00:00
23L000914_1_fastqc.html                                    100%  257KB   1.7MB/s   00:00
23L000907_2_fastqc.zip                                     100%  748KB   2.8MB/s   00:00
23L000906_2_fastqc.zip                                     100%  742KB   3.0MB/s   00:00
23L000911_1_fastqc.html                                    100%  261KB   1.7MB/s   00:00
23L000904_2_fastqc.html                                    100%  258KB   2.0MB/s   00:00
23L000904_2_fastqc.zip                                     100%  754KB   3.1MB/s   00:00
23L000919_1_fastqc.zip                                     100%  755KB   3.3MB/s   00:00
23L000905_1_fastqc.html                                    100%  257KB   1.8MB/s   00:00
23L000910_2_fastqc.html                                    100%  255KB   1.5MB/s   00:00
23L000905_2_fastqc.zip                                     100%  743KB   2.5MB/s   00:00
23L000913_1_fastqc.html                                    100%  258KB   2.1MB/s   00:00
23L000906_2_fastqc.html                                    100%  251KB   2.0MB/s   00:00
23L000913_1_fastqc.zip                                     100%  755KB   2.5MB/s   00:00
23L000907_1_fastqc.html                                    100%  261KB   1.9MB/s   00:00
23L000912_2_fastqc.html                                    100%  254KB   1.8MB/s   00:00
23L000912_1_fastqc.zip                                     100%  757KB   3.1MB/s   00:00
23L000910_1_fastqc.zip                                     100%  752KB   2.5MB/s   00:00
23L000903_2_fastqc.html                                    100%  254KB   1.4MB/s   00:00
23L000911_1_fastqc.zip                                     100%  759KB   2.9MB/s   00:00
23L000909_2_fastqc.zip                                     100%  745KB   3.1MB/s   00:00
23L000914_1_fastqc.zip                                     100%  754KB   3.0MB/s   00:00
23L000908_2_fastqc.zip                                     100%  744KB   3.1MB/s   00:00
23L000919_1_fastqc.html                                    100%  259KB   1.8MB/s   00:00
23L000920_1_fastqc.html                                    100%  260KB   1.8MB/s   00:00
23L000920_2_fastqc.zip                                     100%  748KB   2.9MB/s   00:00
23L000908_1_fastqc.html                                    100%  257KB   2.1MB/s   00:00
23L000909_2_fastqc.html                                    100%  251KB   1.6MB/s   00:00
(base)
qianjianghu@Qianjiangs-MacBook-Pro: ~
$

6. Run Hisat2

6.1 Build Index

6.2 run Hisat2

[huqj@login0b]$ cd /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/Jobs/Hisat2
[huqj@login0b Hisat2]$ vim hisat2.job

[huqj@login0b Hisat2]$ sbatch hisat2.job
#! /bin/bash
#
#SBATCH -N 1
#SBATCH -J HISAT2
#SBATCH -t 2:00:00
#SBATCH -c 8
#SBATCH -o OUT/hisat2-%A_%a.out
#SBATCH --array=0-13 # job array index
#SBATCH --mail-type=ALL
#SBATCH --mail-user=huqj@pitt.edu
#############################
# hisat2.job
#
# Purpose: align the cutadapt-trimmed paired-end reads of ONE sample with
#          HISAT2, convert the SAM output to BAM and sort the BAM by read
#          name. The sample is selected by the SLURM array index from the
#          list of names in $project/Data/raw/Mouse_PCLS.txt.
# Inputs:  $project/Data/Cutadapt/<sample>_{1,2}.cutadapt.fastq.gz
# Outputs: $project/Mapping/HISAT2/<sample>.sam
#          $project/Mapping/HISAT2/<sample>.bam
#          $project/Mapping/HISAT2_sort/<sample>.sorted.query.bam
# Usage:   sbatch hisat2.job
# NOTE(review): the intermediate .sam and unsorted .bam files are kept on
#               disk; remove them manually if they are not needed.
#############################

## HISAT2 set-up
module load gcc/8.2.0
module load hisat2/2.2.1
module load samtools/1.9

set -x

################################

project=/bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS

# This script only makes sense as an array task; fail early with a clear
# message instead of a cryptic "bad array subscript" error.
: "${SLURM_ARRAY_TASK_ID:?must be submitted as a SLURM job array (sbatch --array)}"

# Sample names are whitespace-separated words in Mouse_PCLS.txt.
names=($(cat "$project/Data/raw/Mouse_PCLS.txt"))
sample=${names[${SLURM_ARRAY_TASK_ID}]}
echo "$sample"

# An array index beyond the end of the list yields an empty sample name.
if [[ -z "$sample" ]]; then
    echo "no sample name for array index ${SLURM_ARRAY_TASK_ID}" >&2
    exit 1
fi

trimfastq=$project/Data/Cutadapt
out=$project/Mapping/HISAT2
out_sort=$project/Mapping/HISAT2_sort
ref=/bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/Refs/Index/Hisat2_Index/Mus_musculus/index/GRCm39_release_108/grcm39_tran/grcm39_tran

# Follow the CPU count granted by SLURM (-c); fall back to 8, the value that
# was previously hard-coded, when the variable is not set.
threads=${SLURM_CPUS_PER_TASK:-8}

####################################

mkdir -p "$out"
mkdir -p "$out_sort"

###################

# Each step consumes the previous step's output, so stop at the first failure
# rather than converting or sorting an incomplete file.
hisat2 -x "$ref" \
        -S "$out/${sample}.sam" \
        -p "$threads" \
        --dta \
        -1 "$trimfastq/${sample}_1.cutadapt.fastq.gz" \
        -2 "$trimfastq/${sample}_2.cutadapt.fastq.gz" \
    || { echo "hisat2 failed for ${sample}" >&2; exit 1; }

samtools view -@ 3 -h -o "$out/${sample}.bam" "$out/${sample}.sam" \
    || { echo "samtools view failed for ${sample}" >&2; exit 1; }

# -n: sort by read name (the HTSeq job reads this file with "-r name").
samtools sort -n -o "$out_sort/${sample}.sorted.query.bam" "$out/${sample}.bam" \
    || { echo "samtools sort failed for ${sample}" >&2; exit 1; }

check alignment results

[huqj@login0b Hisat2]$ cd OUT/
[huqj@login0b OUT]$ ls
hisat2-1938175_0.out   hisat2-1938175_12.out  hisat2-1938175_2.out  hisat2-1938175_5.out  hisat2-1938175_8.out
hisat2-1938175_10.out  hisat2-1938175_13.out  hisat2-1938175_3.out  hisat2-1938175_6.out  hisat2-1938175_9.out
hisat2-1938175_11.out  hisat2-1938175_1.out   hisat2-1938175_4.out  hisat2-1938175_7.out
[huqj@login0b OUT]$ grep "overall alignment rate" *.out
hisat2-1938175_0.out:97.05% overall alignment rate
hisat2-1938175_10.out:97.25% overall alignment rate
hisat2-1938175_11.out:97.52% overall alignment rate
hisat2-1938175_12.out:97.04% overall alignment rate
hisat2-1938175_13.out:97.01% overall alignment rate
hisat2-1938175_1.out:92.22% overall alignment rate
hisat2-1938175_2.out:97.09% overall alignment rate
hisat2-1938175_3.out:97.77% overall alignment rate
hisat2-1938175_4.out:95.67% overall alignment rate
hisat2-1938175_5.out:97.53% overall alignment rate
hisat2-1938175_6.out:97.62% overall alignment rate
hisat2-1938175_7.out:97.23% overall alignment rate
hisat2-1938175_8.out:94.78% overall alignment rate
hisat2-1938175_9.out:97.31% overall alignment rate

7. Run HTseq

7.1 submit the HTseq job

[huqj@login0b]$ cd /bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/Jobs/HT-Seq
[huqj@login0b HT-Seq]$ vim htseq.job
#! /bin/bash
#SBATCH -N 1
#SBATCH -t 1:00:00
#SBATCH -J htseq
#SBATCH -c 6
#SBATCH -o OUT/htseq-%A_%a.out
#SBATCH --array=0-13 # job array index 
#SBATCH --mail-type=ALL
#SBATCH --mail-user=huqj@pitt.edu
######################################
# htseq.job
#
# Purpose: count reads per gene for ONE sample with htseq-count, using the
#          name-sorted BAM written by the Hisat2 job. The sample is selected
#          by the SLURM array index from the list of names in
#          $project/Data/raw/Mouse_PCLS.txt.
# Inputs:  $project/Mapping/HISAT2_sort/<sample>.sorted.query.bam
#          the Mus_musculus.GRCm39.108 GTF annotation (see $gtf below)
# Outputs: $project/Counts/HT-Seq/<sample>.counts.txt
# Usage:   sbatch htseq.job
######################################


############ htseq set-up
module load htseq/0.13.5

set -x

###########################


project=/bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS

# This script only makes sense as an array task; fail early with a clear
# message instead of a cryptic "bad array subscript" error.
: "${SLURM_ARRAY_TASK_ID:?must be submitted as a SLURM job array (sbatch --array)}"

# Sample names are whitespace-separated words in Mouse_PCLS.txt.
names=($(cat "$project/Data/raw/Mouse_PCLS.txt"))
sample=${names[${SLURM_ARRAY_TASK_ID}]}
echo "$sample"

# An array index beyond the end of the list yields an empty sample name.
if [[ -z "$sample" ]]; then
    echo "no sample name for array index ${SLURM_ARRAY_TASK_ID}" >&2
    exit 1
fi
#########################################
BAM=$project/Mapping/HISAT2_sort
gtf=/bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/Refs/Annotation/Mus_musculus/GRCm39_release_108/Mus_musculus.GRCm39.108.gtf
out=$project/Counts/HT-Seq

#############################

mkdir -p "$out"

# "-s reverse" is the correct strandedness setting for this seq data.
# Keep comments OUT of the continued command below: a backslash only continues
# the line when it is the very last character, so "\  # comment" ends the
# command early and the remaining options are run as a separate command.
htseq-count -f bam \
    -r name \
    -s reverse \
    -t exon \
    -m union \
    -i gene_id \
    "$BAM/$sample.sorted.query.bam" \
    "$gtf" > "$out/$sample.counts.txt"

7.2 check results

[huqj@login0b HT-Seq]$ cat 23L000903.counts.txt | head
ENSMUSG00000000001	3551
ENSMUSG00000000003	0
ENSMUSG00000000028	110
ENSMUSG00000000031	24
ENSMUSG00000000037	15
ENSMUSG00000000049	1
ENSMUSG00000000056	997
ENSMUSG00000000058	2324
ENSMUSG00000000078	7886
ENSMUSG00000000085	921

8. Download the count files to the local MacBook Pro (MBP)

qianjianghu@Qianjiangs-MacBook-Pro: ~
$ scp -r huqj@htc.crc.pitt.edu:/bgfs/mkoenigshoff/huqj/DataAnalysis/rna_seq/ML_mouse_PCLS/Counts/ /Users/qianjianghu/Desktop/ML_mouse_PCLS/
huqj@htc.crc.pitt.edu's password:
23L000910.counts.txt                          100% 1202KB   3.9MB/s   00:00
23L000906.counts.txt                          100% 1202KB   2.7MB/s   00:00
23L000919.counts.txt                          100% 1201KB   3.3MB/s   00:00
23L000904.counts.txt                          100% 1201KB   3.4MB/s   00:00
23L000912.counts.txt                          100% 1201KB   4.1MB/s   00:00
23L000903.counts.txt                          100% 1202KB   3.0MB/s   00:00
23L000908.counts.txt                          100% 1202KB   4.3MB/s   00:00
23L000914.counts.txt                          100% 1201KB   4.7MB/s   00:00
23L000909.counts.txt                          100% 1203KB   4.8MB/s   00:00
23L000905.counts.txt                          100% 1202KB   4.7MB/s   00:00
23L000913.counts.txt                          100% 1203KB   6.2MB/s   00:00
23L000920.counts.txt                          100% 1202KB   4.7MB/s   00:00
23L000911.counts.txt                          100% 1202KB   3.8MB/s   00:00
23L000907.counts.txt                          100% 1200KB   4.2MB/s   00:00
23L000903.txt                                 100%   20MB  14.3MB/s   00:01
23L000914.txt.summary                         100%  472     5.3KB/s   00:00
23L000904.txt                                 100%   20MB  20.3MB/s   00:01
23L000904.txt.summary                         100%  473     5.3KB/s   00:00
23L000909.txt.summary                         100%  473     7.8KB/s   00:00
23L000906.txt.summary                         100%  472     7.2KB/s   00:00
23L000920.txt                                 100%   20MB  14.7MB/s   00:01
23L000919.txt.summary                         100%  472     6.5KB/s   00:00
23L000912.txt                                 100%   20MB  19.1MB/s   00:01
23L000920.txt.summary                         100%  472     5.3KB/s   00:00
23L000910.txt.summary                         100%  472     3.8KB/s   00:00
23L000905.txt                                 100%   20MB  20.3MB/s   00:01
23L000913.txt                                 100%   20MB  20.4MB/s   00:01
23L000914.txt                                 100%   20MB  18.4MB/s   00:01
23L000912.txt.summary                         100%  472     7.4KB/s   00:00
23L000910.txt                                 100%   20MB  16.0MB/s   00:01
23L000919.txt                                 100%   20MB   9.9MB/s   00:02
23L000911.txt.summary                         100%  474     3.4KB/s   00:00
23L000913.txt.summary                         100%  473     4.1KB/s   00:00
23L000903.txt.summary                         100%  472     3.2KB/s   00:00
23L000908.txt                                 100%   20MB  11.1MB/s   00:01
23L000906.txt                                 100%   20MB  12.3MB/s   00:01
23L000905.txt.summary                         100%  472     8.3KB/s   00:00
23L000911.txt                                 100%   20MB  12.6MB/s   00:01
23L000907.txt                                 100%   20MB  10.5MB/s   00:01
23L000909.txt                                 100%   20MB  13.1MB/s   00:01
23L000907.txt.summary                         100%  472     4.0KB/s   00:00
23L000908.txt.summary                         100%  472     3.8KB/s   00:00
(base)