tools-iuc/tools/falco/falco.xml at main · galaxyproject/tools-iuc · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
<tool id="falco" name="Falco" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <description>An alternative, more performant implementation of FastQC for high throughput sequence quality control</description>
    <macros>
        <token name="@TOOL_VERSION@">1.3.0</token>
        <token name="@VERSION_SUFFIX@">0</token>
        <token name="@PROFILE@">25.0</token>
    </macros>
    <xrefs>
        <xref type="bio.tools">falco</xref>
    </xrefs>
    <requirements>
        <requirement type="package" version="@TOOL_VERSION@">falco</requirement>
    </requirements>
    <command detect_errors="aggressive"><![CDATA[
        #import re
        #set input_name_sl = re.sub('[^\w\-\s]', '_', str($input_file.element_identifier))

        #if 'bam' in $input_file.ext:
            #set format = 'bam'
        #elif 'sam' in $input_file.ext:
            #set format = 'sam'
        #elif 'gz' in $input_file.ext:
            #set format = 'fastq.gz'
        #else
            #set format = 'fastq'
        #end if

        ln -s '${input_file}' '${input_name_sl}' &&
        falco
            #if $contaminants:
                --contaminants '${contaminants}'
            #end if

            #if $adapters.dataset and str($adapters) > ''
                --adapters '${adapters}'
            #end if

            #if $limits.dataset and str($limits) > ''
                --limits '${limits}'
            #end if
            --threads \${GALAXY_SLOTS:-2}
            --quiet
            ## #if $min_length:
            ##     --min_length $min_length
            ## #end if
            $nogroup
            ## --kmers $kmers
            -f '${format}'
            '${input_name_sl}'
            #if $subsample > 1:
              -subsample $subsample
            #end if
            $bisulfite
            $reverse_complement
            $generate_summary
    ]]></command>
    <inputs>
        <param format="fastq,fastq.gz,bam,sam" name="input_file" type="data" label="Raw read data from your current history"/>
        <param name="contaminants" type="data" format="tabular" optional="true" label="Contaminant list" help="tab delimited file with 2 columns: name and sequence.  For example: Illumina Small RNA RT Primer&#x9;CAAGCAGAAGACGGCATACGA"/>
        <param argument="--adapters" type="data" format="tabular" optional="true" label="Adapter list" help="List of adapters adapter sequences which will be explicity searched against the library. It should be a tab-delimited file with 2 columns: name and sequence."/>
        <param name="limits" type="data" format="txt" optional="true" label="Submodule and Limit specifing file" help="a file that specifies which submodules are to be executed (default=all) and also specifies the thresholds for the each submodules warning parameter."/>
        <param argument="--nogroup" type="boolean" truevalue="--nogroup" falsevalue="" checked="False" label="Disable grouping of bases for reads &gt;50bp" help=" Using this option, your plots may end up a ridiculous size. You have been warned!"/>
        <!-- Not implemented in falco yet <param argument="-min_length" type="integer" value="" optional="true" label="Lower limit on the length of the sequence to be shown in the report" help=" [NOT YET IMPLEMENTED IN FALCO]. Sets an artificial lower limit on the length of the sequence to be shown in the report. As long as you set this to a value greater or equal to your longest read length then this will be the sequence length used to create your read groups. This can be useful for making  directly comaparable statistics from datasets with somewhat variable read length."/> -->
        <!-- Ignored by falco and always set to 7 <param argument="-kmers" type="integer" value="7" min="2" max="10" label="Length of Kmer to look for" help="IGNORED BY FALCO AND ALWAYS SET TO 7. Specifies the length of Kmer to look for in the Kmer content module. Specified Kmer length must be between 2 and 10. Default length is 7 if not specified." /> -->
        <param argument="-subsample" type="integer" value="1" min="1" label="Subsampling Factor" help="This makes falco faster (but possibly less accurate) by only processing reads that are multiple of this value (using 0-based indexing to number reads)"/>
        <param argument="-bisulfite" type="boolean" truevalue="-bisulfite" falsevalue="" checked="False" label="Bisulfite Sequencing" help="This parameter indicates whether the reads are from whole genome bisulfite sequencing. When enabled, Falco will account for the expected increase in Ts and decrease in Cs in the base content."/>
        <param argument="reverse_complement" type="boolean" truevalue="-reverse-complement" falsevalue="" checked="False" label="Reverse Complement" help="This parameter specifies whether the input sequences are reverse-complemented. When enabled, all modules in Falco will be tested by swapping A/T and C/G."/>
        <param name="generate_summary" type="boolean" truevalue="" falsevalue="-skip-summary" checked="False" label="Generate summary output of QC test results"/>
    </inputs>
    <outputs>
        <data format="html" name="html_file" from_work_dir="fastqc_report.html" label="${tool.name} on ${on_string}: Webpage"/>
        <data format="txt" name="text_file" from_work_dir="fastqc_data.txt" label="${tool.name} on ${on_string}: RawData"/>
        <data format="txt" name="summary_file" from_work_dir="summary.txt" label="${tool.name} on ${on_string}: SummaryData">
            <filter>generate_summary</filter>
        </data>
    </outputs>
    <tests>
        <!-- Test with fastq input -->
        <test expect_num_outputs="2">
            <param name="input_file" value="1000trimmed.fastq"/>
            <output name="html_file" ftype="html">
                <assert_contents>
                  <has_line_matching expression="\s*1000trimmed_fastq - report"/>
                </assert_contents>
            </output>
            <!-- two lines diff to allow for reported version to change -->
            <output name="text_file" file="fastqc_data.txt" ftype="txt" lines_diff="2"/>
        </test>
        <!-- Test with fastq.gz input -->
        <test expect_num_outputs="2">
            <param name="input_file" value="1000trimmed.fastq.gz"/>
            <output name="html_file" ftype="html">
                <assert_contents>
                  <has_line_matching expression="\s*1000trimmed_fastq_gz - report"/>
                </assert_contents>
            </output>
            <!-- four lines diff to allow for reported version to change; two more to accomodate changed input file name -->
            <output name="text_file" file="fastqc_data.txt" ftype="txt" lines_diff="4"/>
        </test>
        <!-- Test with BAM input -->
        <test expect_num_outputs="2">
            <param name="input_file" value="hisat_output_1.bam"/>
            <output name="html_file" ftype="html">
                <assert_contents>
                  <has_line_matching expression="\s*hisat_output_1_bam - report"/>
                </assert_contents>
            </output>
            <!-- four lines diff to allow for reported version to change; two more to accomodate changed input file name -->
            <output name="text_file" file="fastqc_data_hisat.txt" ftype="txt" lines_diff="4"/>
        </test>
        <!-- Test summary file option -->
        <test expect_num_outputs="3">
            <param name="input_file" value="1000trimmed.fastq"/>
            <param name="generate_summary" value="true"/>
            <output name="html_file" ftype="html">
                <assert_contents>
                  <has_line_matching expression="\s*1000trimmed_fastq - report"/>
                </assert_contents>
            </output>
            <output name="text_file" file="fastqc_data.txt" ftype="txt" lines_diff="2"/>
            <output name="summary_file" file="fastqc_data_summary.txt" ftype="txt"/>
        </test>
        <test expect_num_outputs="2">
            <param name="input_file" value="1000trimmed.fastq"/>
            <param name="contaminants" value="contaminant_list.txt" ftype="tabular"/>
            <output name="html_file" ftype="html">
                <assert_contents>
                  <has_line_matching expression="\s*1000trimmed_fastq - report"/>
                </assert_contents>
            </output>
            <output name="text_file" file="fastqc_data_contaminants.txt" ftype="txt" lines_diff="2"/>
        </test>
        <test expect_num_outputs="2">
            <param name="input_file" value="1000trimmed.fastq"/>
            <param name="adapters" value="adapter_list.txt" ftype="tabular"/>
            <output name="html_file" ftype="html">
                <assert_contents>
                  <has_line_matching expression="\s*1000trimmed_fastq - report"/>
                </assert_contents>
            </output>
            <output name="text_file" file="fastqc_data_adapters.txt" ftype="txt" lines_diff="2"/>
        </test>
        <test expect_num_outputs="3">
            <param name="input_file" value="1000trimmed.fastq"/>
            <param name="limits" value="limits.txt" ftype="txt"/>
            <param name="generate_summary" value="true"/>
            <output name="html_file" ftype="html">
                <assert_contents>
                  <has_line_matching expression="\s*1000trimmed_fastq - report"/>
                </assert_contents>
            </output>
            <output name="summary_file" file="fastqc_data_customlimits_summary.txt" ftype="txt"/>
        </test>
        <!-- ## The kmers param is ignored in Falco and always set to 7. If this ever gets reconsidered, this test could be uncommented.
        <test expect_num_outputs="2">
            <param name="input_file" value="1000trimmed.fastq" ftype="fastq"/>
            <param name="kmers" value="7"/>
            <param name="limits" value="limits.txt" ftype="txt"/>
            <output name="html_file" file="fastqc_report_kmer.html" ftype="html" lines_diff="2"/>
            <output name="text_file" file="fastqc_data_kmer.txt" ftype="txt"/>
            <assert_command>
                <has_text text="kmers 7"/>
            </assert_command>
        </test>
            ## The min_length param is not yet implemented in Falco.
               Once it is, this test may be uncommented.
        <test expect_num_outputs="2">
            <param name="input_file" value="1000trimmed.fastq"/>
            <param name="min_length" value="108"/>
            <output name="html_file" file="fastqc_report_min_length.html" ftype="html" lines_diff="2"/>
            <output name="text_file" file="fastqc_data_min_length.txt" ftype="txt"/>
        </test> -->
        <test expect_num_outputs="2">
            <param name="input_file" value="1000trimmed.fastq" ftype="fastq"/>
            <param name="nogroup" value="true"/>
            <output name="html_file" ftype="html">
                <assert_contents>
                  <has_line_matching expression="\s*1000trimmed_fastq - report"/>
                </assert_contents>
            </output>
            <output name="text_file" file="fastqc_data_nogroup.txt" ftype="txt" lines_diff="2"/>
        </test>
        <test expect_num_outputs="3">
            <param name="input_file" value="1000trimmed.fastq"/>
            <param name="subsample" value="10"/>
            <param name="generate_summary" value="true"/>
            <output name="html_file" ftype="html">
                <assert_contents>
                  <has_line_matching expression="\s*1000trimmed_fastq - report"/>
                </assert_contents>
            </output>
            <output name="text_file" file="fastqc_report_subsample.txt" ftype="txt" lines_diff="2"/>
            <output name="summary_file" file="fastqc_report_subsample_summary.txt" ftype="txt"/>
        </test>
        <test expect_num_outputs="3">
            <param name="input_file" value="1000trimmed.fastq"/>
            <param name="bisulfite" value="true"/>
            <param name="generate_summary" value="true"/>
            <output name="html_file" ftype="html">
                <assert_contents>
                  <has_line_matching expression="\s*1000trimmed_fastq - report"/>
                </assert_contents>
            </output>
            <output name="text_file" file="fastqc_report_bisulfite.txt" ftype="txt" lines_diff="2"/>
            <output name="summary_file" file="fastqc_report_bisulfite_summary.txt" ftype="txt"/>
        </test>
        <test expect_num_outputs="2">
            <param name="input_file" value="1000trimmed.fastq"/>
            <param name="reverse_complement" value="true"/>
            <output name="html_file" ftype="html">
                <assert_contents>
                  <has_line_matching expression="\s*1000trimmed_fastq - report"/>
                </assert_contents>
            </output>
            <output name="text_file" file="fastqc_report_reverse_complement.txt" ftype="txt" lines_diff="2"/>
        </test>
    </tests>
    <help><![CDATA[
**What it does**

Falco_ is a high-speed emulation of the popular FastQC software for quality control of sequencing data.

💚️ With its superior performance Falco saves computational resources and gives you back results faster than FastQC.

We recommend it for most use cases (but see below for rare exceptions). 💚️

The main functions of Falco are very similar to those of FastQC:

- Import of data from BAM, SAM or FastQ/FastQ.gz files (any variant),
- Providing a quick overview to tell you in which areas there may be problems
- Summary graphs and tables to quickly assess your data
- Export of results to an HTML-based report


.. class:: infomark

The plain text report generated by Falco can be used as a "FastQC" report in MultiQC and its data is very similar though not 100% identical to that generated by FastQC on the same inputs.

.. class:: Warning mark

    In the following situations, FastQC is still a better solution than this version of Falco:

- your input is bz2-compressed fastq

  Falco doesn't currently support fastq.bz2 as input format meaning Galaxy has to perform a relatively slow format conversion before running the tool, which together makes the analysis slower than with FastQC.

- you need the HTML report to be viewable offline

  The current version of Falco relies on plotly to generate the graphs in the HTML report dynamically each time it's viewed.
  MultiQC plots generated from Falco's raw data output are, of course, viewable offline just like the ones generated from FastQC output.

-----

**Inputs and outputs**

The Falco_ development repo includes very good documentation.
A summary of it follows below for those in a tearing hurry.

This wrapper will accept a Galaxy fastq, fastq.gz, sam or bam as the input read file to check.
It will also take an optional file containing a list of contaminants information, in the form of
a tab-delimited file with 2 columns, name and sequence. As another option the tool takes a custom
limits.txt file that allows setting the warning thresholds for the different modules and also specifies
which modules to include in the output.

The tool produces a basic text and a HTML output file that contain all of the results, including the following:

- Basic Statistics
- Per base sequence quality
- Per sequence quality scores
- Per base sequence content
- Per base GC content
- Per sequence GC content
- Per base N content
- Sequence Length Distribution
- Sequence Duplication Levels
- Overrepresented sequences
- Adapter Content

All except Basic Statistics and Overrepresented sequences are plots.
 .. _Falco: https://github.com/smithlabcode/falco/
    ]]></help>
    <citations>
        <citation type="doi">10.12688/f1000research.21142.2</citation>
    </citations>
</tool>