Added comments. Changed variable names to better match data that is stored. Added total number of classified and invalid CDSs to the 'results_statistics.tsv' output file. Limiting the number of values stored to better control memory usage while writing the 'results_contigsInfo.tsv' output file. Removed FASTA header integer conversion before BLASTp. Storing the seqids of the CDSs closest to contig tips during gene prediction to simplify LOTSC and PLOT determination. Improved info printed to stdout by the CreateSchema and AlleleCall modules. Improved locus identifier extraction from file paths.
B-UMMI · Feb 19, 2024 · 7cf5f26 · 7cf5f26
1 parent 82f8f98
commit 7cf5f26
Show file tree

Hide file tree

Showing 13 changed files with 1,375 additions and 1,276 deletions.
diff --git a/CHEWBBACA/AlleleCall/allele_call.py b/CHEWBBACA/AlleleCall/allele_call.py
diff --git a/CHEWBBACA/CreateSchema/create_schema.py b/CHEWBBACA/CreateSchema/create_schema.py
diff --git a/CHEWBBACA/ExtractCgMLST/determine_cgmlst.py b/CHEWBBACA/ExtractCgMLST/determine_cgmlst.py
@@ -186,10 +186,10 @@ def compute_cgMLST(matrix, sorted_genomes, threshold, step):
         Dictionary with the number of genomes used to compute the
         cgMLST as keys and the size of the core-genome as values.
     """
-    # determine genes at or above threshold
+    # Determine genes at or above threshold
     cgMLST_size = {}
     for i in im.inclusive_range(1, len(sorted_genomes), step):
-        # get subdataframe for current genomes
+        # Get subdataframe for current genomes
         current_df = matrix.loc[sorted_genomes[:i]]
         pa_rows, _ = current_df.shape
         is_above_threshold = current_df.apply(above_threshold,

diff --git a/CHEWBBACA/utils/constants.py b/CHEWBBACA/utils/constants.py
@@ -85,6 +85,13 @@
 
 GENETIC_CODES_DEFAULT = 11
 
+# Proteins to cluster are divided into a maximum
+# of 40 smaller groups in CreateSchema.
+# Dividing based on the number of CPU cores can lead to
+# variable results because we do not have pre-defined clusters
+# in CreateSchema.
+CREATESCHEMA_CLUSTERING_NGROUPS = 40
+
 # Valid FASTA file extensions
 FASTA_EXTENSIONS = ['.fasta', '.fna', '.ffn', '.fa', '.fas']
 
@@ -193,6 +200,8 @@
 # Headers for TSV files with paralogous loci count and per genome
 PARALOGOUS_COUNTS_HEADER = 'Locus\tCount'
 PARALOGOUS_LIST_HEADER = 'Genome\tLoci\tCDS'
+# Header for TSV file with information about CDSs classified as ambiguous
+MISSING_HEADER = 'Index\tGenome\tLocus\tLocus_classification\tCDS\tCDS_classification'
 
 # Allele calling classifications
 ALLELECALL_CLASSIFICATIONS = ['EXC', 'INF', 'PLOT3', 'PLOT5',
@@ -202,8 +211,8 @@
 # PLNF classification for modes {1,2,3}
 PROBABLE_LNF = 'PLNF'
 
-# Regex pattern to match locus identifier
-LOCUS_ID_PATTERN = r'.*-protein[0-9]+'
+# Maximum number of values stored while creating the 'results_contigsInfo.tsv' file
+RESULTS_CONTIGS_MAXVALS = 300000
 
 # String template for a standard single line FASTA record
 FASTA_RECORD_TEMPLATE = '>{0}\n{1}'
@@ -250,6 +259,10 @@
 REPRESENTATIVE_DETERMINATION = 'Representative determination'
 WRAPPING_UP = 'Wrapping up'
 
+# CreateSchema exclusive section headers
+EXCLUDE_SMALL = 'Short CDS removal'
+FINAL_BLASTp = 'Final BLASTp'
+
 # File header for file with summary statistics created by PrepExternalSchema
 PREPEXTERNAL_SUMMARY_STATS_HEADER = ('Gene\tTotal_alleles\tValid_alleles\t'
                                      'Number_representatives')