"""Multi fastqc pipeline Author: Thomas Cokelaer Affiliation: Institut Pasteur @ 2019 This pipeline is part of Sequana software (sequana.readthedocs.io) """ import sequana from sequana import snaketools as sm # This must be defined before the include configfile: "config.yaml" # Generic include of some dynamic modules exec(open(sequana.modules["fastqc_dynamic"], "r").read()) # A convenient manager manager = sm.PipelineManager("fastqc", config) manager.setup(globals(), mode="error") rule pipeline: input: "multiqc/multiqc_report.html", ".sequana/rulegraph.svg", "summary.png" # FASTQC on input data set __fastqc_samples__input_fastq = manager.getrawdata() __fastqc_samples__output_done = "samples/{sample}/{sample}.done" __fastqc_samples__wkdir = "samples/{sample}" # manager.getwkdir("fastqc_samples") __fastqc_samples__log = "samples/%s/fastqc.log" % manager.sample include: fastqc_dynamic("samples", manager) comments = """

Number of samples: {}
Paired data: {}
Browse files here: tree """.format( len(manager.samples.keys()) , manager.paired) from sequana_pipelines.fastqc import version as v2 from sequana import version as v1 comments += """
Sequana version: {}""".format(v1) comments += """
Sequana_fastqc version: {}

""".format(v2) # Multiqc rule __multiqc2__input = expand(__fastqc_samples__output_done, sample=manager.samples) __multiqc2__logs = "multiqc/multiqc.log" __multiqc2__output = "multiqc/multiqc_report.html" __multiqc2__indir = config['multiqc']['indir'] __multiqc2__outdir = "multiqc" __multiqc2__config = "multiqc_config.yaml" # do not specify fastqc itself alone, otherwise it fails (feb 2020) __multiqc2__modules = "" config['multiqc']['options'] = "-m fastqc " + config["multiqc"]["options"].replace("-f", " ") + \ " --comment \"{}\" ".format(comments) include: sm.modules["multiqc2"] __rulegraph__input = manager.snakefile __rulegraph__output = ".sequana/rulegraph.svg" __rulegraph__mapper = {"multiqc2":"multiqc/multiqc_report.html"} include: sm.modules['rulegraph'] localrules: rulegraph rule plotting_and_stats: input: expand(__fastqc_samples__output_done, sample=manager.samples) output: "summary.png", "summary.json" run: import glob from sequana.fastqc import FastQC from sequana.summary import Summary from sequana_pipelines.fastqc import version summary = Summary("fastqc", caller="sequana_fastqc", sample_name="multi samples") summary.description = "summary sequana_fastqc pipeline" summary.pipeline_version = version filenames = glob.glob("samples/*/*.zip") f = FastQC() for sample in manager.samples: filenames = glob.glob("samples/{}/*zip".format(sample)) filenames = sorted(filenames) assert len(filenames) in [0, 1,2] if len(filenames) != 0: f.read_sample(filenames[0], sample) summary.data[sample] = f.fastqc_data[sample]['basic_statistics'] else: summary.data[sample] = { 'Filename': 'No fastqc found', 'File type': 'Conventional base calls', 'Encoding': 'Sanger / Illumina 1.9', 'Total Sequences': 0, 'Sequences flagged as poor quality': 0.0, 'Sequence length': '0', '%GC': 0, 'total_deduplicated_percentage': 0, 'mean_quality': 0, 'avg_sequence_length': 0} summary.to_json("summary.json") f.plot_sequence_quality() from pylab import savefig, gcf f = gcf() f.set_size_inches(10,6) savefig(output[0], dpi=200) # Those rules takes a couple of seconds so no need for a cluster localrules: multiqc2, rulegraph onsuccess: #shell("ln -f -s {} index.html".format(__multiqc2__output)) shell("rm -f ./samples/*/*.done") shell("rm -f ./samples/*/*.log") shell("chmod -R g+w .") # Create the tree.html file with all fastqc reports from sequana.utils.tree import HTMLDirectory hh = HTMLDirectory(".", pattern="fastqc.html") with open("tree.html", "w") as fout: fout.write(hh.get_html()) from sequana import logger logger.level = "INFO" # This should create the stats plot and the Makefile manager.teardown() manager.clean_multiqc(__multiqc2__output) # Now, the main HTML report import pandas as pd from sequana.utils.datatables_js import DataTable import json # Summary table with links towards fastqc data = json.load(open("summary.json", "r")) df = pd.DataFrame(data['data']) df = df.T df.drop(['File type', "Encoding", "Sequences flagged as poor quality"], axis=1, inplace=True) df['mean_quality'] = [int(float(x)*100)/100 for x in df['mean_quality']] df['total_deduplicated_percentage'] = [int(float(x)*100)/100 for x in df['total_deduplicated_percentage']] df = df.reset_index() df = df.rename({ "index": "sample", "total_deduplicated_percentage": "duplicated (%)"}, axis=1) df['link'] = ["samples/{}/{}_R1_001_fastqc.html".format(sample, sample) for sample in df['sample']] datatable = DataTable(df, 'fastqc', index=False) datatable.datatable.datatable_options = {'paging': 'false', 'buttons': ['copy', 'csv'], 'bSort': 'true', 'dom':"BRSPfrti" } datatable.datatable.set_links_to_column('link', 'sample') js = datatable.create_javascript_function() htmltable = datatable.create_datatable() # The summary table at the top from sequana_pipelines.fastqc import version as vv df_general = pd.DataFrame({ "samples": len(manager.samples), "paired": manager.paired, "sequana_fastqc_version": vv}, index=["summary"]) datatable = DataTable(df_general.T, 'general', index=True) datatable.datatable.datatable_options = {'paging': 'false', 'bFilter': 'false', 'bInfo': 'false', 'header': 'false', 'bSort': 'true'} js2 = datatable.create_javascript_function() htmltable2 = datatable.create_datatable(style="width: 20%; float:left" ) from sequana.modules_report.summary import SummaryModule2 data = { "name": manager.name, "rulegraph": __rulegraph__output, "stats": "stats.txt" } # Here the is main HTML page report contents = "

General information

" contents += """
{}
""".format(js2 + htmltable2) image = SummaryModule2.png_to_embedded_png("dummy", "summary.png", style="width:80%; height:40%") contents += '
The following image shows the overall quality of your samples (R1 only).
{}
'.format(image) # the main table contents += """
""" contents += "

Here is a summary for all the samples. The CSV button allows you to export the basic statistics. {}
".format(js + htmltable) contents += """
Please look at the multiqc report for more details about your run.""" contents += """

Individual fastqc HTML reports for each sample

""" contents += hh.get_html() s = SummaryModule2(data, intro=contents) shell("rm -rf rulegraph") # embedded in report shell("rm -rf summary.png") # embedded in report onerror: print("An error occurred. See message above.")