Skip to content

Commit

Permalink
Allow fasta input to be gzip
Browse files Browse the repository at this point in the history
  • Loading branch information
dariober committed Sep 20, 2024
1 parent 883c7d3 commit 765ceab
Show file tree
Hide file tree
Showing 4 changed files with 89 additions and 60 deletions.
51 changes: 36 additions & 15 deletions packages/apollo-cli/src/commands/assembly/add-from-fasta.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ import { queryApollo, submitAssembly } from '../../utils.js'
import { Response } from 'undici'

export default class AddFasta extends FileCommand {
static description = `Add new assembly from a fasta file. The input file may be:
static summary = 'Add a new assembly from fasta input'
static description = `Add new assembly. The input fasta may be:
* A local file
* An external fasta file
* The id of a file previously uploaded to Apollo`
Expand All @@ -27,7 +28,7 @@ export default class AddFasta extends FileCommand {
]

static args = {
'input-file': Args.string({
input: Args.string({
description:
'Input fasta file, local or remote, or id of a previously uploaded file',
required: true,
Expand Down Expand Up @@ -61,6 +62,18 @@ Indexes should be named <my.fasta.gz>.gzi and <my.fasta.gz>.fai unless options -
gzi: Flags.string({
description: 'Gzi index of the (not-editable) fasta file',
}),
gzip: Flags.boolean({
char: 'z',
description:
'For local file input: Override autodetection and instruct that input is gzip compressed',
exclusive: ['decompressed'],
}),
decompressed: Flags.boolean({
char: 'd',
description:
'For local file input: Override autodetection and instruct that input is decompressed',
exclusive: ['gzip'],
}),
}

public async run(): Promise<void> {
Expand All @@ -70,14 +83,14 @@ Indexes should be named <my.fasta.gz>.gzi and <my.fasta.gz>.fai unless options -
const access: { address: string; accessToken: string } =
await this.getAccess(flags['config-file'], flags.profile)

const assemblyName = flags.assembly ?? path.basename(args['input-file'])
const assemblyName = flags.assembly ?? path.basename(args.input)

const fastaIsFileId = await isFileId(
args['input-file'],
args.input,
access.address,
access.accessToken,
)
const isExternal = isValidHttpUrl(args['input-file'])
const isExternal = isValidHttpUrl(args.input)

let body
if (isExternal) {
Expand All @@ -90,13 +103,13 @@ Indexes should be named <my.fasta.gz>.gzi and <my.fasta.gz>.fai unless options -
assemblyName,
typeName: 'AddAssemblyFromExternalChange',
externalLocation: {
fa: args['input-file'],
fa: args.input,
fai: flags.index,
},
}
} else if (flags['not-editable']) {
const gzi = flags.gzi ?? `${args['input-file']}.gzi`
const fai = flags.fai ?? `${args['input-file']}.fai`
const gzi = flags.gzi ?? `${args.input}.gzi`
const fai = flags.fai ?? `${args.input}.fai`

const gziIsFileId = await isFileId(
gzi,
Expand All @@ -121,11 +134,11 @@ Indexes should be named <my.fasta.gz>.gzi and <my.fasta.gz>.fai unless options -
}

const faId = fastaIsFileId
? args['input-file']
? args.input
: await this.uploadFile(
access.address,
access.accessToken,
args['input-file'],
args.input,
'application/x-bgzip-fasta',
true,
)
Expand Down Expand Up @@ -160,17 +173,25 @@ Indexes should be named <my.fasta.gz>.gzi and <my.fasta.gz>.fai unless options -
},
}
} else {
if (!isExternal && !fs.existsSync(args['input-file']) && !fastaIsFileId) {
this.error(`Input "${args['input-file']}" is not valid`)
if (!isExternal && !fs.existsSync(args.input) && !fastaIsFileId) {
this.error(`Input "${args.input}" is not valid`)
}
let isGzip = args.input.endsWith('.gz')
if (flags.gzip) {
isGzip = true
}
if (flags.decompressed) {
isGzip = false
}

const fileId = fastaIsFileId
? args['input-file']
? args.input
: await this.uploadFile(
access.address,
access.accessToken,
args['input-file'],
args.input,
'text/x-fasta',
false,
isGzip,
)
body = {
assemblyName,
Expand Down
4 changes: 2 additions & 2 deletions packages/apollo-cli/src/commands/file/upload.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ import { Response } from 'undici'

import { FileCommand } from '../../fileCommand.js'
import { filterJsonList, queryApollo } from '../../utils.js'
import { gzip } from 'node:zlib'

export default class Upload extends FileCommand {
static summary = 'Upload a local file to the Apollo server'
Expand Down Expand Up @@ -81,6 +80,7 @@ export default class Upload extends FileCommand {
}

let isGzip = flags['input-file'].endsWith('.gz')
// eslint-disable-next-line unicorn/consistent-destructuring
if (flags.gzip) {
isGzip = true
}
Expand All @@ -96,7 +96,7 @@ export default class Upload extends FileCommand {
access.accessToken,
flags['input-file'],
type,
// eslint-disable-next-line unicorn/consistent-destructuring

isGzip,
)

Expand Down
76 changes: 40 additions & 36 deletions packages/apollo-cli/test/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import json
import os
import sys
import shutil
import unittest
from utils import shell

Expand Down Expand Up @@ -906,40 +907,6 @@ def testFeatureChecksIndexed(self):
p = shell(f"{apollo} feature check {P} -i {xid}")
self.assertTrue("InternalStopCodonCheck" in p.stdout)

def testFeatureChecksIndexed(self):
shell(
f"{apollo} assembly add-from-fasta {P} -a v1 test_data/tiny.fasta.gz --not-editable -f"
)
shell(f"{apollo} feature import {P} -a v1 -i test_data/tiny.fasta.gff3 -d")
# shell(f"{apollo} assembly add-from-gff {P} test_data/tiny.fasta.gff3 -a v1 -f")
shell(f"{apollo} assembly check {P} -a v1 -c CDSCheck")
p = shell(f"{apollo} feature check {P} -a v1")
## If we don't edit a feature, checks are not activated (!?)
self.assertEqual(p.stdout.strip(), "[]")

p = shell(f"{apollo} feature get {P} -a v1")
ff = json.loads(p.stdout)
g1 = [x for x in ff if x["gffId"] == "MyGene"][0]
g2 = [x for x in ff if x["gffId"] == "AnotherGene"][0]

shell(f"{apollo} feature edit-coords {P} -i {g1['_id']} -e 201")
shell(f"{apollo} feature edit-coords {P} -i {g2['_id']} -e 251")
p = shell(f"{apollo} feature check {P} -a v1")
out = json.loads(p.stdout)
self.assertTrue(len(out) > 1)
self.assertTrue("InternalStopCodonCheck" in p.stdout)

## Ids with checks
ids = []
for x in out:
ids.extend(x["ids"])
self.assertTrue(len(set(ids)) > 1)

## Retrieve by feature id
xid = " ".join(ids)
p = shell(f"{apollo} feature check {P} -i {xid}")
self.assertTrue("InternalStopCodonCheck" in p.stdout)

def testUser(self):
p = shell(f"{apollo} user get {P}")
out = json.loads(p.stdout)
Expand Down Expand Up @@ -1072,11 +1039,46 @@ def testFileUploadGzip(self):
# Uploading a gzip file must skip compression and just copy the file
with open("test_data/tiny.fasta.gz", "rb") as gz:
md5 = hashlib.md5(gz.read()).hexdigest()
p = shell(f"{apollo} file upload {P} -i test_data/tiny.fasta.gz -t text/x-fasta")
p = shell(
f"{apollo} file upload {P} -i test_data/tiny.fasta.gz -t text/x-fasta"
)
out = json.loads(p.stdout)
self.assertEqual(md5, out["checksum"])
shell(f"{apollo} assembly add-from-fasta {P} -f {out['_id']}")

def testAddAssemblyGzip(self):
    """Check gzip handling of local fasta input in `assembly add-from-fasta`.

    Covers four cases:
      1. A `.gz` file is loaded with no flags (compression autodetected).
      2. Plain text in a file named `*.gz` is loaded with `--decompressed`.
      3. Gzip data in a file without the `.gz` suffix is loaded with `--gzip`.
      4. The same file as (3) without `--gzip` makes the command exit
         with a non-zero return code.

    Temporary copies are created under `test_data/` and are removed in
    `finally` blocks so that a failing step does not leave them behind.
    """

    def assertSequenceLoaded():
        # The sequence of assembly vv1 must come back as fasta text
        # containing a known stretch of the test sequence.
        p = shell(f"{apollo} assembly sequence {P} -a vv1")
        self.assertTrue(p.stdout.startswith(">"))
        self.assertIn("cattgttgcggagttgaaca", p.stdout)

    def removeIfExists(path):
        # The copy itself may have failed, so the file may not exist.
        if os.path.exists(path):
            os.remove(path)

    # Autodetect format
    shell(f"{apollo} assembly add-from-fasta {P} test_data/tiny.fasta.gz -f -a vv1")
    assertSequenceLoaded()

    # Skip autodetect: plain text in a file with a misleading .gz name
    plainNamedGz = "test_data/tmp.gz"
    try:
        shutil.copy("test_data/tiny.fasta", plainNamedGz)
        shell(
            f"{apollo} assembly add-from-fasta {P} test_data/tmp.gz -f -a vv1 --decompressed"
        )
        assertSequenceLoaded()
    finally:
        removeIfExists(plainNamedGz)

    # Skip autodetect: gzip data in a file without the .gz suffix
    gzipNamedTmp = "test_data/fasta.tmp"
    try:
        shutil.copy("test_data/tiny.fasta.gz", gzipNamedTmp)
        shell(
            f"{apollo} assembly add-from-fasta {P} test_data/fasta.tmp -f -a vv1 --gzip"
        )
        assertSequenceLoaded()

        # Autodetect false negative: without --gzip the gzip data is
        # treated as plain text and the command is expected to fail.
        p = shell(
            f"{apollo} assembly add-from-fasta {P} test_data/fasta.tmp -f -a vv1",
            strict=False,
        )
        self.assertNotEqual(p.returncode, 0)
    finally:
        removeIfExists(gzipNamedTmp)

def testAddAssemblyFromFilesNotEditable(self):
# It would be good to check that really there was no sequence loading
shell(
Expand All @@ -1102,7 +1104,9 @@ def testAddAssemblyFromFilesNotEditable(self):

def testAddAssemblyFromFileIdsNotEditable(self):
# Upload and get Ids for: bgzip fasta, fai and gzi
p = shell(f"{apollo} file upload {P} -i test_data/tiny.fasta.gz -t application/x-bgzip-fasta")
p = shell(
f"{apollo} file upload {P} -i test_data/tiny.fasta.gz -t application/x-bgzip-fasta"
)
fastaId = json.loads(p.stdout)["_id"]

p = shell(f"{apollo} file upload {P} -i test_data/tiny.fasta.gz.fai")
Expand Down
18 changes: 11 additions & 7 deletions packages/website/docs/cli/assembly.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,40 +2,44 @@

Commands to manage assemblies

- [`apollo assembly add-from-fasta INPUT-FILE`](#apollo-assembly-add-from-fasta-input-file)
- [`apollo assembly add-from-fasta INPUT`](#apollo-assembly-add-from-fasta-input)
- [`apollo assembly add-from-gff INPUT-FILE`](#apollo-assembly-add-from-gff-input-file)
- [`apollo assembly check`](#apollo-assembly-check)
- [`apollo assembly delete`](#apollo-assembly-delete)
- [`apollo assembly get`](#apollo-assembly-get)
- [`apollo assembly sequence`](#apollo-assembly-sequence)

## `apollo assembly add-from-fasta INPUT-FILE`
## `apollo assembly add-from-fasta INPUT`

Add new assembly from a fasta file. The input file may be:
Add a new assembly from fasta input

```
USAGE
$ apollo assembly add-from-fasta INPUT-FILE [--profile <value>] [--config-file <value>] [-a <value>] [-x <value>] [-f] [-n]
[--fai <value>] [--gzi <value>]
$ apollo assembly add-from-fasta INPUT [--profile <value>] [--config-file <value>] [-a <value>] [-x <value>] [-f] [-n]
[--fai <value>] [--gzi <value>] [-z | -d]
ARGUMENTS
INPUT-FILE Input fasta file, local or remote, or id of a previously uploaded file
INPUT Input fasta file, local or remote, or id of a previously uploaded file
FLAGS
-a, --assembly=<value> Name for this assembly. Use the file name if omitted
-d, --decompressed For local file input: Override autodetection and instruct that input is decompressed
-f, --force Delete existing assembly, if it exists
-n, --not-editable The fasta sequence is not editable. Apollo will not load it into the database and instead
use the provided indexes to query it. This option assumes the fasta file is bgzip'd with
`bgzip` and indexed with `samtools faidx`. Indexes should be named <my.fasta.gz>.gzi and
<my.fasta.gz>.fai unless options --fai and --gzi are set
-x, --index=<value> URL of the index. Required if input is an external source
-z, --gzip For local file input: Override autodetection and instruct that input is gzip compressed
--config-file=<value> Use this config file (mostly for testing)
--fai=<value> Fasta index of the (not-editable) fasta file
--gzi=<value> Gzi index of the (not-editable) fasta file
--profile=<value> Use credentials from this profile
DESCRIPTION
Add new assembly from a fasta file. The input file may be:
Add a new assembly from fasta input
Add new assembly. The input fasta may be:
* A local file
* An external fasta file
* The id of a file previously uploaded to Apollo
Expand Down

0 comments on commit 765ceab

Please sign in to comment.