Structural identification and annotation repeatitive elements

#Installations :
conda create -n repeatmasking -c bioconda RepeatMasker repeatmodeler trf 

# Set path to assembled genome
genome=path_to_assembled_genome

# RepeatProteinMask
RepeatProteinMask $genome -noLowSimple -pvalue 0.0001 
cat ${genome}.annot | awk -v OFS='\t' '{print $4,$5,$6,$7,$8,$9,$10,$11}' | grep -v SeqID > repeatproteinmask.bed
bedtools getfasta -fi ${genome}.fna -bed repeatproteinmask.bed -fo RM.out.fasta  

# RepeatMasker
RepeatMasker -species Viridiplantae $genome -pa 8 

RM2Bed.py ${genome}.fna.out
bedtools getfasta -fi ${genome}.fna -bed ${genome}.rm.bed -fo RM.out.fasta  

# RepeatModeler 
BuildDatabase -name your_species_name ${genome}.fna
RepeatModeler -database your_species_name -pa 32 -LTRStruct  

# Miniature Inverted-repeat Transposable Elements (MITE)-Hunter
~/apps/MITE-hunter/MITE_Hunter_manager.pl -i ${genome}.fna -g ${genome}_MH -c 32 -n 5 -S 12345678 –P 1

## MITE_Hunter_manager TEs:  ${genome}_MH_Step8_singlet.fa 

# Tandem Repeat Finder (TRF)
trf ${genome}.fna 2 5 7 80 10 50 2000 -m -h 

# Convert TRF to fasta:
# source:TRFdat_to_bed.py #source: https://raw.githubusercontent.com/hdashnow/TandemRepeatFinder_scripts/master/TRFdat_to_bed.py

./TRFdat_to_bed.py --dat ${genome}.fna.2.7.7.80.10.50.500.dat --bed ${genome}.trf.bed 

bedtools getfasta -fi ${genome}.fna -bed ${genome}.trf.bed  > ${genome}.trf.fasta

# PILER as described in https://www.drive5.com/piler/. 
pals -self ${genome}.fna hit.gff
piler -trs hit.gff -out trs.gff
mkdir fams
piler -trs2fasta trs.gff -seq ${genome}.fasta -path fams
mkdir aligned_fams
cd fams
for fam in *
do
   muscle -in $fam -out ../aligned_fams/$fam -maxiters 1 -diags1
done

cd ..
mkdir cons
cd aligned_fams

for fam in *
do
   piler -cons $fam -out ../cons/$fam -label $fam
done
cd ../cons
cat * > ../${genome}.piler_library.fasta

# sequence redundancy removal - CD-HIT
# contatenate into one file  
cat ${genome}.RPM.out.fasta ${genome}.RM.fasta ${genome}.RMod.out.fasta ${genome}.EDTA.out.fasta ${genome}.trf.fasta ${genome}_MH_Step8_singlet.fa ${genome}.piler_library.fasta > redundant_repeat_lib.fasta 

cd-hit -i redundant_repeat_lib.fasta -d 0 -o redundant_repeat_lib.non_redun.fasta -c 0.80 -n 5 -G 1 -g 0 -M 0 -T 24

# Excluding highly conserved protein coding genes or multi-member gene families
# Here BLASTP search was used (e-value threshold of 1e-10) against monocots protein database downloaded from EnsemblPlants (https://plants.ensembl.org/index.html) and Banana Genome Hub.

# Extract the fasta files the predicted repeats from the various tools

rep_lib=redundant_repeat_lib.non_redun

makeblastdb -in blastdb/musa.arab_T_ensemblemonocot.enseteg.fa -input_type prot 
### BLASTP
blastx 
-db blastdb/musa.arab_T_ensemblemonocot.enseteg.fa \
-query $rep_lib \
-outfmt '6 qseqid qlen slen qstart qend sstart send stitle pident length evalue bitscore' \
-evalue 1e-10 \
-max_hsps 1 \
-num_threads 24 \
-out $rep_lib.blastx.out

# convert blast hits to bed file and exclude the bed file interval from fasta 
cut -f 1,4,5 $rep_lib.blastx.out | awk '{if ($2>$3) print $1,$3,$2,".",".","-"; else print $1,$2,$3,".",".","+";}' OFS='\t' | awk '{a=$2-1;print $1,a,$3,$4,$5,$6;}' OFS='\t'| bedtools sort > $rep_lib.protein_cleared.draft.bed
bedtools merge -i $rep_lib.protein_cleared.draft.bed -s -c 6 -o distinct > $rep_lib.protein_cleared.final.bed
# generate the index file 
samtools faidx $rep_lib
# reformate the index file to genome file 
awk -v OFS='\t' {'print $1,$2'} $rep_lib.fai > $rep_lib.txt
# final bed file to exclude  <br />
bedtools complement -i $rep_lib.protein_cleared.final.bed -g $rep_lib.txt > $rep_lib.final.bed
# final protein coding gene free final de novo repeats 
bedtools getfasta -fi ${rep_lib}.fasta -bed $rep_lib.final.bed > $rep_file.final

# Extensive de-novo Transposable Element Annotator (EDTA) 
# installation: https://github.com/oushujun/EDTA 

EDTA.pl \
-genome ${genome}.fna \
--species other \
--step all \
--overwrite 0 \
--sensitive 1 \
--anno 1 \
--evaluate 1 \
--threads 48 \
--debug 1

# Final combined repeat identification and repeatitive sequence masking 
cat $rep_file.final ${genome}.fna.mod.EDTA.TElib.fa > ${genome}_repeat_lib.fasta

Annotation of structurally identified repeatitive elements

RepeatMasker -nolow -no_is -norna -engine ncbi -lib ${genome}_repeat_lib.fasta ${genome}.fna -pa 12 -gff -poly -u -xsmall

RM2Bed.py ${genome}.fna.out
bedtools getfasta -fi ${genome}.fna -bed ${genome}.rm.bed -fo ${genome}.RM.final.out.fasta  

### use the hard masked coordinate bed file for soft masking in bedtools 
# bedtools maskfasta \
# -fi ${genome}.fna \
# -bed ${genome}.rm.bed -fo ${genome}.softmasked.1.fna -soft

# Annotate RepeatMasker output repeats using TEsorter
TEsorter \
${genome}.RM.final.out.fasta \
--seq-type nucl \
--processors 48 \
--pass2-rule 80-80-80 \
--prefix ${genome}_RM_tesorter.out

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

README.md

README.md

Structural identification and annotation repeatitive elements

Annotation of structurally identified repeatitive elements

Files

README.md

Latest commit

History

README.md

File metadata and controls

Structural identification and annotation repeatitive elements

Annotation of structurally identified repeatitive elements