Merge pull request #991 from milaboratory/final-fixes-4.2.0

Final fixes before 4.2.0 release
milaboratory · Jan 26, 2023 · b0f194e · b0f194e
2 parents 9ae760b + 5a6072d
commit b0f194e
Show file tree

Hide file tree

Showing 7 changed files with 71 additions and 32 deletions.
diff --git a/.gitignore b/.gitignore
@@ -15,3 +15,4 @@ build
 mi.license
 .mi.license
 Ig-4_S4.*
+hash_projection.txt
diff --git a/build.gradle.kts b/build.gradle.kts
@@ -98,12 +98,12 @@ repositories {
     }
 }
 
-val milibVersion = "2.2.0-46-master"
-val repseqioVersion = "1.6.0-6-master"
+val milibVersion = "2.3.0"
+val repseqioVersion = "1.7.0"
 val miplotsVersion = "1.2.0"
-val mitoolVersion = "1.5.0-58-main"
+val mitoolVersion = "1.6.0"
 val jacksonBomVersion = "2.14.1"
-val redberryPipeVersion = "1.3.0-18-refactoring"
+val redberryPipeVersion = "1.4.0"
 
 dependencies {
     api("cc.redberry:pipe:$redberryPipeVersion")
@@ -119,7 +119,7 @@ dependencies {
     api("com.milaboratory:miplots:$miplotsVersion")
 
     // implementation("com.milaboratory:milm2-jvm:1.0-SNAPSHOT") { isChanging = true }
-    implementation("com.milaboratory:milm2-jvm:2.5.0")
+    implementation("com.milaboratory:milm2-jvm:2.7.0")
 
     implementation(platform("com.fasterxml.jackson:jackson-bom:$jacksonBomVersion"))
     implementation("com.fasterxml.jackson.module:jackson-module-kotlin")

diff --git a/CHANGELOG → changelogs/old/CHANGELOG b/CHANGELOG → changelogs/old/CHANGELOG
diff --git a/CHANGELOG_CURRENT → changelogs/old/CHANGELOG_CURRENT b/CHANGELOG_CURRENT → changelogs/old/CHANGELOG_CURRENT
diff --git a/changelogs/v4.2.0.md b/changelogs/v4.2.0.md
@@ -16,32 +16,29 @@ Complete support of [sample barcodes](https://docs.milaboratories.com/mixcr/refe
 - from sequence header lines;
 - from inside the [tag pattern](https://docs.milaboratories.com/mixcr/reference/overview-built-in-presets/ref-tag-pattern/).
 
-Now one can analyze multiple patient samples at once. Along with a powerful [file name expansion](https://docs.milaboratories.com/mixcr/reference/overview-built-in-presets/ref-input-file-name-expansion/) functionality, one can process any kind of sequencing protocol with any custom combination of sample, cell and UMI barcoding.   
+Now one can analyze multiple patient samples at once. Along with a powerful [file name expansion](https://docs.milaboratories.com/mixcr/reference/overview-built-in-presets/ref-input-file-name-expansion/) functionality, one can process any kind of sequencing protocol with any custom combination of sample, cell and UMI barcoding.
 
 Processing of multiple samples can be done in two principal modes with respect to sample barcodes: (1) data can be split by samples right at the `align` stage and processed separately, or (2) all samples can be processed as a single set of sequences and separated only at the very last `exportClones` step. Both approaches have their pros and cons, allowing one to choose the best strategy for the experimental setup and study goals.
 
 # New robust filters for single cell and molecular barcoded data
 
-For 10x Genomics and other fragmented protocols, a new powerful k-mer based filtering algorithm is now used to eliminate cross-cell contamination coming from plasmatic cells.  
+For 10x Genomics and other fragmented protocols, a new powerful k-mer based filtering algorithm is now used to eliminate cross-cell contamination coming from plasma cells.
 
 For UMI filtering, a new algorithm from the paper by [J. Barron (2020)](https://arxiv.org/abs/2007.07350) allows for better automated histogram thresholding in barcoded data filtering.
 
 
-
-
 # List of all changes
 
 ## Sample barcodes
 
 - support for more than two `fastq` files as input (`I1` and `I2` reads support)
 - multiple possible sources of data for sample resolution:
-    - sequences extracted with tag pattern (including those coming from `I1` and `I2` reads)
-    - samples can be based on specific pattern variant (with multi-variant patterns, separated by `||`, allows to easily adopt MiGEC-style sample files)
-    - [TBD] strings extracted from fastq description line
-    - parts of file names (extracted using file name expansion mechanism)
+  - sequences extracted with tag pattern (including those coming from `I1` and `I2` reads)
+  - samples can be based on a specific pattern variant (multi-variant patterns, separated by `||`, make it easy to adopt MiGEC-style sample files)
+  - parts of file names (extracted using file name expansion mechanism)
 - flexible sample table matching criteria
-    - matching multiple tags
-    - matching variant id from multi-variant tag patterns
+  - matching multiple tags
+  - matching variant id from multi-variant tag patterns
 - special `--sample-table` mixin option allowing for flexible sample table definition in a tab-delimited table form
 - special `--infer-sample-table` mixin option to infer sample table for sample tags from file name expansion
 - special generic presets for multiplexed data analysis scenarios (e.g. `generic-tcr-amplicon-separate-samples-umi`)
@@ -62,8 +59,7 @@ For UMI filtering, a new algorithm from the paper by [J. Barron (2020)](https://
 
 ## Reference library
 
-- reference V/D/J/C gene library upgrade to repseqio v2.1 (
-  see [changelog](https://github.com/repseqio/library/releases/tag/v2.1))
+- reference V/D/J/C gene library upgrade to repseqio v2.1 (see [changelog](https://github.com/repseqio/library/releases/tag/v2.1))
 
 ## New commands
 
@@ -73,13 +69,10 @@ For UMI filtering, a new algorithm from the paper by [J. Barron (2020)](https://
 
 - optimized aligner parameters for long-read data
 - fixed system temp folder detection behaviour, now mixcr respects `TMPDIR` environment variable
-- rework of preset-mixin logic, now external presets (like those starting from `local:...`) are packed into the output
-  `*.vdjca` file on `align` step, the same applies to all externally linked information, like tag whitelists and
-  sample lists. This behaviour facilitates better analysis reproducibility and more transparent parameter logistics.
+- rework of preset-mixin logic, now external presets (like those starting from `local:...`) are packed into the output `*.vdjca` file on `align` step, the same applies to all externally linked information, like tag whitelists and sample lists. This behaviour facilitates better analysis reproducibility and more transparent parameter logistics.
 - new mixin options to adjust tag refinement whitelists with `analyze`: `--set-whitelist` and `--reset-whitelist`
 - removed `refineTagsAndSort` options `-w` and `--whitelist`; corresponding deprecation error message printed if used
-- new grouping feature for `exportClones`, allowing to normalize values for `-readFraction` and `-uniqueTagFraction ...`
-  columns to totals for certain compartments instead of normalizing to the whole dataset. This feature allows to output e.g. fractions of reads inside the cell.
+- new grouping feature for `exportClones`, allowing values in the `-readFraction` and `-uniqueTagFraction ...` columns to be normalized to totals for certain compartments instead of the whole dataset. This feature makes it possible to output e.g. fractions of reads inside the cell.
 - new mixin options `--add-export-clone-table-splitting`, `--reset-export-clone-table-splitting`, `--add-export-clone-grouping` and `--reset-export-clone-grouping`
 - improved sensitivity of `findAlleles` command
 - add tags info in `exportAlignmentsPretty` and `exportClonesPretty`

diff --git a/src/main/kotlin/com/milaboratory/mixcr/cli/CommandAlign.kt b/src/main/kotlin/com/milaboratory/mixcr/cli/CommandAlign.kt
@@ -60,7 +60,6 @@ import com.milaboratory.mixcr.basictypes.VDJCAlignmentsWriter
 import com.milaboratory.mixcr.basictypes.VDJCHit
 import com.milaboratory.mixcr.basictypes.tag.TagCount
 import com.milaboratory.mixcr.basictypes.tag.TagTuple
-import com.milaboratory.mixcr.basictypes.tag.TagType
 import com.milaboratory.mixcr.cli.CommandAlign.Cmd.InputType.BAM
 import com.milaboratory.mixcr.cli.CommandAlign.Cmd.InputType.Fasta
 import com.milaboratory.mixcr.cli.CommandAlign.Cmd.InputType.PairedEndFastq
@@ -73,7 +72,6 @@ import com.milaboratory.mixcr.cli.CommandAlignPipeline.ProcessingBundleStatus.No
 import com.milaboratory.mixcr.cli.CommandAlignPipeline.ProcessingBundleStatus.NotParsed
 import com.milaboratory.mixcr.cli.CommandAlignPipeline.ProcessingBundleStatus.SampleNotMatched
 import com.milaboratory.mixcr.cli.CommandAlignPipeline.cellSplitGroupLabel
-import com.milaboratory.mixcr.cli.CommandAlignPipeline.detectTagTypeByName
 import com.milaboratory.mixcr.cli.CommandAlignPipeline.getTagsExtractor
 import com.milaboratory.mixcr.cli.CommandAlignPipeline.listToSampleName
 import com.milaboratory.mixcr.cli.CommonDescriptions.DEFAULT_VALUE_FROM_PRESET
@@ -89,7 +87,6 @@ import com.milaboratory.util.ReportHelper
 import com.milaboratory.util.ReportUtil
 import com.milaboratory.util.SmartProgressReporter
 import com.milaboratory.util.limit
-import com.milaboratory.util.listComparator
 import io.repseq.core.Chains
 import io.repseq.core.GeneFeature.VRegion
 import io.repseq.core.GeneFeature.VRegionWithP
@@ -710,6 +707,10 @@ object CommandAlign {
 
         private val paramsSpec by lazy { MiXCRParamsSpec(presetName, mixins.mixins) }
 
+        /** Packed version of the parameter specs, i.e. all external presets are packed into the spec object.
+        This is the version written to the output file header. */
+        private val paramsSpecPacked by lazy { paramsSpec.pack() }
+
         private val bpPair by lazy { paramsResolver.resolve(paramsSpec, printParameters = logger.verbose) }
 
         private val cmdParams get() = bpPair.second
@@ -783,14 +784,19 @@ object CommandAlign {
                 }
                 ?: emptyList()
 
-            val keyParameter = cmdParams.tagPattern.toString()
+            val samplePattern = cmdParams.tagPattern.toString()
 
             return when (inputFileGroups.inputType) {
                 BAM -> {
                     if (inputFileGroups.fileGroups.size != 1)
                         throw ValidationException("File concatenation supported only for fastq files.")
                     val files = inputFileGroups.fileGroups.first().files
-                    MiXCRMain.lm.reportApplicationInputs(files, keyParameter, sampleDescriptions)
+                    MiXCRMain.lm.reportApplicationInputs(
+                        files,
+                        paramsSpecPacked.base.consistentHashString(),
+                        samplePattern,
+                        sampleDescriptions
+                    )
                     BAMReader(files.toTypedArray(), cmdParams.bamDropNonVDJ, cmdParams.replaceWildcards)
                         .map { ProcessingBundle(it) }
                 }
@@ -799,7 +805,12 @@ object CommandAlign {
                     if (inputFileGroups.fileGroups.size != 1 || inputFileGroups.fileGroups.first().files.size != 1)
                         throw ValidationException("File concatenation supported only for fastq files.")
                     val inputFile = inputFileGroups.fileGroups.first().files.first()
-                    MiXCRMain.lm.reportApplicationInputs(listOf(inputFile), keyParameter, sampleDescriptions)
+                    MiXCRMain.lm.reportApplicationInputs(
+                        listOf(inputFile),
+                        paramsSpecPacked.base.consistentHashString(),
+                        samplePattern,
+                        sampleDescriptions
+                    )
                     FastaSequenceReaderWrapper(
                         FastaReader(inputFile.toFile(), NucleotideSequence.ALPHABET),
                         cmdParams.replaceWildcards
@@ -808,7 +819,12 @@ object CommandAlign {
                 }
 
                 else -> { // All fastq file types
-                    MiXCRMain.lm.reportApplicationInputs(inputFileGroups.allFiles, keyParameter, sampleDescriptions)
+                    MiXCRMain.lm.reportApplicationInputs(
+                        inputFileGroups.allFiles,
+                        paramsSpecPacked.base.consistentHashString(),
+                        samplePattern,
+                        sampleDescriptions
+                    )
                     assert(inputFileGroups.fileGroups[0].files.size == inputFileGroups.inputType.numberOfReads)
                     FastqGroupReader(inputFileGroups.fileGroups, cmdParams.replaceWildcards, readBufferSize)
                         .map { ProcessingBundle(it.read, it.fileTags, it.originalReadId) }
@@ -918,6 +934,7 @@ object CommandAlign {
 
             // Attaching report to aligner
             aligner.setEventsListener(reportBuilder)
+
             use(
                 createReader(),
                 alignedWriter(outputFile),
@@ -944,9 +961,7 @@ object CommandAlign {
                 writers?.writeHeader(
                     MiXCRHeader(
                         inputHash,
-                        // Output file header will contain packed version of the parameter specs,
-                        // i.e. all external presets and will be packed into the spec object
-                        paramsSpec.pack(),
+                        paramsSpecPacked,
                         MiXCRStepParams().add(MiXCRCommandDescriptor.align, cmdParams),
                         tagsExtractor.tagsInfo,
                         aligner.parameters,

diff --git a/src/test/kotlin/com/milaboratory/mixcr/PresetsTest.kt b/src/test/kotlin/com/milaboratory/mixcr/PresetsTest.kt
@@ -1,6 +1,8 @@
 package com.milaboratory.mixcr
 
 import com.fasterxml.jackson.module.kotlin.readValue
+import com.milaboratory.cli.ParamsBundleSpecBaseAddress
+import com.milaboratory.cli.ParamsBundleSpecBaseEmbedded
 import com.milaboratory.mitool.helpers.K_OM
 import com.milaboratory.mitool.helpers.K_YAML_OM
 import com.milaboratory.mixcr.basictypes.tag.TagInfo
@@ -15,6 +17,8 @@ import io.kotest.assertions.withClue
 import org.junit.Assert
 import org.junit.Test
 import java.nio.file.Paths
+import kotlin.io.path.Path
+import kotlin.io.path.bufferedWriter
 import kotlin.io.path.listDirectoryEntries
 
 class PresetsTest {
@@ -39,6 +43,32 @@ class PresetsTest {
         }
     }
 
+    @Test
+    fun test2XConsistentHash() { // Checks JSON round-trip hashCode stability per preset and dumps consistent hashes to a file.
+        val mapOfHashes = mutableMapOf<String, MutableList<String>>() // consistent hash string -> preset names that produced it
+        for (presetName in Presets.nonAbstractPresetNames) {
+            val bundle = Presets.MiXCRBundleResolver.resolvePreset(presetName)
+            val bundleJson = K_OM.writeValueAsString(bundle)
+            val bundleDeserialized = K_OM.readValue(bundleJson, MiXCRParamsBundle::class.java)
+            Assert.assertEquals(bundle.hashCode(), bundleDeserialized.hashCode()) // the only assertion in this test
+            val asAddressSpec = ParamsBundleSpecBaseAddress<MiXCRParamsBundle>(presetName) // spec built from the preset name
+            val asEmbeddedSpec = ParamsBundleSpecBaseEmbedded(bundle) // spec built from the resolved bundle itself
+            for (hash in listOf(asAddressSpec.consistentHashString(), asEmbeddedSpec.consistentHashString()))
+                mapOfHashes.compute(hash) { _, b -> // append presetName to the list for this hash, creating the list if absent
+                    val l = (b ?: mutableListOf())
+                    l += presetName
+                    l
+                }
+        }
+        Path("hash_projection.txt").bufferedWriter().use { writer -> // relative path; NOTE(review): file content is never asserted on
+            mapOfHashes.forEach { (hash, presets) ->
+                presets.forEach { preset ->
+                    writer.write("$hash\t$preset\n") // one tab-separated "hash, preset" line per pair
+                }
+            }
+        }
+    }
+
     @Test
     fun test3() {
         // val bundle = Presets.resolveParamsBundle("assemblePartial-universal")