Skip to content

Commit

Permalink
Added comments. Changed variable names to better match data that is s…
Browse files Browse the repository at this point in the history
…tored. Added total number of classified and invalid CDSs to the 'results_statistics.tsv' output file. Limiting the number of values stored to better control memory usage while writing the 'results_contigsInfo.tsv' output file. Removed FASTA header integer conversion before BLASTp. Storing the seqids of the CDSs closest to contig tips during gene prediction to simplify LOTSC and PLOT determination. Improved info printed to stdout by the CreateSchema and AlleleCall modules. Improved locus identifier extraction from file paths.
  • Loading branch information
rfm-targa committed Feb 19, 2024
1 parent 82f8f98 commit 7cf5f26
Show file tree
Hide file tree
Showing 13 changed files with 1,375 additions and 1,276 deletions.
1,676 changes: 840 additions & 836 deletions CHEWBBACA/AlleleCall/allele_call.py

Large diffs are not rendered by default.

277 changes: 129 additions & 148 deletions CHEWBBACA/CreateSchema/create_schema.py

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions CHEWBBACA/ExtractCgMLST/determine_cgmlst.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,10 +186,10 @@ def compute_cgMLST(matrix, sorted_genomes, threshold, step):
Dictionary with the number of genomes used to compute the
cgMLST as keys and the size of the core-genome as values.
"""
# determine genes at or above threshold
# Determine genes at or above threshold
cgMLST_size = {}
for i in im.inclusive_range(1, len(sorted_genomes), step):
# get subdataframe for current genomes
# Get subdataframe for current genomes
current_df = matrix.loc[sorted_genomes[:i]]
pa_rows, _ = current_df.shape
is_above_threshold = current_df.apply(above_threshold,
Expand Down
17 changes: 15 additions & 2 deletions CHEWBBACA/utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,13 @@

GENETIC_CODES_DEFAULT = 11

# Proteins to cluster are divided into a maximum of 40 smaller
# groups in CreateSchema. The number of groups is fixed because
# dividing based on the number of CPU cores can lead to variable
# results, as CreateSchema does not have pre-defined clusters.
CREATESCHEMA_CLUSTERING_NGROUPS = 40

# Valid FASTA file extensions
FASTA_EXTENSIONS = ['.fasta', '.fna', '.ffn', '.fa', '.fas']

Expand Down Expand Up @@ -193,6 +200,8 @@
# Headers for TSV files with paralogous loci counts and paralogous loci per genome
PARALOGOUS_COUNTS_HEADER = 'Locus\tCount'
PARALOGOUS_LIST_HEADER = 'Genome\tLoci\tCDS'
# Header for TSV file with information about CDSs classified as ambiguous
MISSING_HEADER = 'Index\tGenome\tLocus\tLocus_classification\tCDS\tCDS_classification'

# Allele calling classifications
ALLELECALL_CLASSIFICATIONS = ['EXC', 'INF', 'PLOT3', 'PLOT5',
Expand All @@ -202,8 +211,8 @@
# PLNF classification for modes {1,2,3}
PROBABLE_LNF = 'PLNF'

# Regex pattern to match locus identifier
LOCUS_ID_PATTERN = r'.*-protein[0-9]+'
# Maximum number of values stored while creating the 'results_contigsInfo.tsv' file
RESULTS_CONTIGS_MAXVALS = 300000

# String template for a standard single line FASTA record
FASTA_RECORD_TEMPLATE = '>{0}\n{1}'
Expand Down Expand Up @@ -250,6 +259,10 @@
REPRESENTATIVE_DETERMINATION = 'Representative determination'
WRAPPING_UP = 'Wrapping up'

# CreateSchema exclusive section headers
EXCLUDE_SMALL = 'Short CDS removal'
FINAL_BLASTp = 'Final BLASTp'

# File header for file with summary statistics created by PrepExternalSchema
PREPEXTERNAL_SUMMARY_STATS_HEADER = ('Gene\tTotal_alleles\tValid_alleles\t'
'Number_representatives')
Expand Down
Loading

0 comments on commit 7cf5f26

Please sign in to comment.