# coding: utf-8
#
# This file is part of Sequana software
#
# Copyright (c) 2016 - Sequana Development Team
#
# File author(s):
# Dimitri Desvillechabrol <dimitri.desvillechabrol@pasteur.fr>,
# <d.desvillechabrol@gmail.com>
#
# Distributed under the terms of the 3-clause BSD license.
# The full license is in the LICENSE file, distributed with this software.
#
# website: https://github.com/sequana/sequana
# documentation: http://sequana.readthedocs.io
#
##############################################################################
"""Module to write coverage report"""
import os
import colorlog
import pandas as pd
from sequana import bedtools
from sequana.modules_report.base_module import SequanaBaseModule
from sequana.modules_report.summary import SummaryModule
from sequana.plots.canvasjs_linegraph import CanvasJSLineGraph
from sequana.utils import config
from sequana.utils.datatables_js import DataTable, DataTableFunction
logger = colorlog.getLogger(__name__)
__all__ = ["CoverageModule", "ChromosomeCoverageModule"]
[docs]class CoverageModule(SequanaBaseModule):
"""Write HTML report of coverage analysis. This class takes either a
:class:`SequanaCoverage` instances or a csv file where analysis are stored.
"""
def __init__(self, data, region_window=200000):
""".. rubric:: constructor
:param data: it can be a csv filename created by sequana_coverage or a
:class:`bedtools.SequanaCoverage` object.
:param region_window:
"""
super().__init__()
self.region_window = region_window
if isinstance(data, bedtools.SequanaCoverage):
self.bed = data
else:
raise TypeError
if len(self.bed._html_list) == 0:
try:
html_list = self.create_chromosome_reports()
except TypeError:
msg = (
"Data must be either a csv file or a :class:`SequanaCoverage` " "instance where zscore is computed."
)
raise TypeError(msg)
else:
html_list = list(self.bed._html_list)
self.title = f"Main coverage report ({config.sample_name})"
self.intro = (
f"<p>Report the coverage of your sample ({config.sample_name}) to check the "
"quality of your mapping and to highlight regions of "
"interest (under and over covered).</p>"
)
# set the chromosome tables
self.sections = list()
self.create_chromosome_table(html_list)
# and create final HTML
self.create_html("sequana_coverage.html")
[docs] def create_chromosome_table(self, html_list):
"""Create table with links to chromosome reports"""
df = pd.DataFrame(
[
[
chrom,
self.bed._basic_stats[chrom]["length"],
self.bed._basic_stats[chrom]["DOC"],
self.bed._basic_stats[chrom]["CV"],
f"{chrom}/{chrom}.cov.html",
]
for chrom in self.bed.chrom_names
],
columns=["chromosome", "size", "mean_coverage", "coef_variation", "link"],
)
datatable = DataTable(df, "chrom")
datatable.datatable.datatable_options = {
"pageLength": 15,
"dom": "Bfrtip",
"buttons": ["copy", "csv"],
}
datatable.datatable.set_links_to_column("link", "chromosome")
js = datatable.create_javascript_function()
html_table = datatable.create_datatable(float_format="%.3g")
self.sections.append(
{
"name": "Chromosomes",
"anchor": "chromosomes",
"content": "<p>Link to coverage analysis report for each chromosome. "
"Size, mean coverage and coefficient of variation are reported"
" in the table below.</p>\n{0}\n{1}".format(js, html_table),
}
)
[docs] def create_chromosome_reports(self):
"""Create HTML report for each chromosome present in data."""
# We choose the first chromosome, to build a common javascript object
datatable_js = CoverageModule.init_roi_datatable(self.bed[0])
chrom_output_dir = config.output_dir
if not os.path.exists(chrom_output_dir):
os.makedirs(chrom_output_dir)
page_list = []
for chrom in self.bed:
logger.info(f"Creating coverage report {chrom.chrom_name}")
chrom_report = ChromosomeCoverageModule(chrom, datatable_js, region_window=self.region_window)
page_list.append(chrom_report.html_page)
return page_list
# a static method because we need it in the coverage standalone
# to initiate the datatables
[docs] def init_roi_datatable(rois):
"""Initiate :class:`DataTableFunction` to create table to link each
row with sub HTML report. All table will have the same appearance.
We can therefore initialise the roi once for all.
:param rois: can be a ROIs from ChromosomeCov instance or a simple
dataframe
"""
# computed
try:
df = rois.df.copy()
except:
df = rois.copy()
df["link"] = ""
# set datatable options
datatable_js = DataTableFunction(df, "roi")
if "start" in df.columns:
datatable_js.set_links_to_column("link", "start")
if "end" in df.columns:
datatable_js.set_links_to_column("link", "end")
datatable_js.datatable_options = {
"scrollX": "true",
"pageLength": 15,
"scrollCollapse": "true",
"dom": "Bfrtip",
"buttons": ["copy", "csv"],
}
return datatable_js
[docs]class ChromosomeCoverageModule(SequanaBaseModule):
"""Write HTML report of coverage analysis for each chromosome. It is
created by CoverageModule.
"""
def __init__(self, chromosome, datatable, region_window=200000, options=None, command="", skip_html=False):
"""
:param chromosome:
:param datatable:
:param directory:
:param int region_window: length of the sub coverage plot
:param options: should contain "W", "k", "circular"
"""
super().__init__()
self.region_window = region_window
directory = chromosome.chrom_name
# to define where are css and js
if directory in {None, "."}:
self.path = ""
directory = "."
else:
self.path = "../"
self.chromosome = chromosome
self.datatable = datatable
self.command = command
self.command += "\nSequana version: {}".format(config.version)
self.title = "Coverage analysis of chromosome {0}".format(self.chromosome.chrom_name)
self.intro = "<p>The genome coverage analysis of the chromosome " "<b>{0}</b>.</p>".format(
self.chromosome.chrom_name
)
self.create_report_content(directory, options=options)
if skip_html:
return
self.html_page = "{0}{1}{2}.cov.html".format(directory, os.sep, self.chromosome.chrom_name)
self.create_html(self.html_page)
# inform the main coverage instance that HTML is ready
self.chromosome.bed._html_list = self.chromosome.bed._html_list.union([self.html_page])
[docs] def create_report_content(self, directory, options=None):
"""Generate the sections list to fill the HTML report."""
self.sections = list()
if self.chromosome._mode == "memory":
# nothing to do, all computations should be already available
# and in memory
rois = self.chromosome.rois
elif self.chromosome._mode == "chunks":
# we need to reset the data to the first chunk
# and compute the median and zscore. So, first, we save the entire
# data set
# self.chromosome.reset()
# self.chromosome.running_median(options['W'], circular=options['circular'])
# self.chromosome.compute_zscore(options['k'])
# We mus set the ROI manually
rois = options["ROIs"]
if self.chromosome.DOC > 0:
self.coverage_plot()
if self.chromosome._mode == "memory":
links = self.subcoverage(rois, directory)
else:
links = None
else:
links = None
self.basic_stats()
if self.chromosome.DOC:
self.regions_of_interest(rois, links)
self.coverage_barplot()
if "gc" in self.chromosome.df.columns:
self.gc_vs_coverage()
self.normalized_coverage()
self.zscore_distribution()
self.add_command()
[docs] def add_command(self):
self.sections.append(
{
"name": "Command",
"anchor": "command",
"content": ("<p>Command used: <pre>{}</pre>.</p>".format(self.command)),
}
)
def _add_large_data_section(self, name, anchor):
self.sections.append(
{
"name": name,
"anchor": anchor,
"content": ("<p>Large data sets (--chunk-size argument " "used), skipped plotting.</p>"),
}
)
[docs] def coverage_plot(self):
"""Coverage section."""
if self.chromosome._mode == "chunks":
self._add_large_data_section("Coverage", "coverage")
return
image = self.create_embedded_png(self.chromosome.plot_coverage, input_arg="filename")
self.sections.append(
{
"name": "Coverage",
"anchor": "coverage",
"content": "<p>The following figures shows the per-base coverage along the"
" reference genome (black line). The blue line indicates the "
"running median. From the normalised coverage, we estimate "
"z-scores on a per-base level. The red lines indicates the "
"z-scores at plus or minus N standard deviations, where N is "
"chosen by the user. (default:4). Only a million point are "
"shown. This may explain some visual discrepancies with. </p>\n{0}".format(image),
}
)
[docs] def coverage_barplot(self):
"""Coverage barplots section."""
if self.chromosome._mode == "chunks":
self._add_large_data_section("Coverage histogram", "cov_barplot")
return
image1 = self.create_embedded_png(self.chromosome.plot_hist_coverage, input_arg="filename", style="width:45%")
image2 = self.create_embedded_png(
self.chromosome.plot_hist_coverage,
input_arg="filename",
style="width:45%",
logx=False,
)
self.sections.append(
{
"name": "Coverage histogram",
"anchor": "cov_barplot",
"content": "<p>The following figures contain the histogram of the genome "
"coverage. The X and Y axis being in log scale in the left panel"
"while only the Y axis is in log scale in the right panel.</p>\n"
"{0}\n{1}".format(image1, image2),
}
)
[docs] def subcoverage(self, rois, directory):
"""Create subcoverage reports to have access to a zoomable line plot.
:params rois:
:param directory: directory name for the chromsome
This method create sub reports for each region of 200,000 bases (can be
changed). Usually, it starts at position 0 so reports will be stored
in e.g. for a genome of 2,300,000 bases::
chromosome_name/chromosome_name_0_200000.html
chromosome_name/chromosome_name_200000_400000.html
...
...
chromosome_name/chromosome_name_2000000_2200000.html
chromosome_name/chromosome_name_2200000_2300000.html
Note that if the BED file positions does not start at zero, then
names will take care of that.
"""
# an aliases
W = self.region_window
name = self.chromosome.chrom_name
chrom = self.chromosome
N = len(self.chromosome)
# position does not always start at position zero
shift = self.chromosome.df.pos.iloc[0]
maxpos = self.chromosome.df.pos.iloc[-1]
# create directory
chrom_output_dir = os.sep.join([config.output_dir, str(directory), "subplots"])
if not os.path.exists(chrom_output_dir):
os.makedirs(chrom_output_dir)
# create the combobox to link toward different sub coverage
# Here, we should (1) get the length of the data and (2)
# figure out the boundary. Indeed, you can imagine a BED file
# that does not start at position zero, but from POS1>0 to POS2
links = ["subplots/{0}_{1}_{2}.html".format(name, i, min(i + W, maxpos)) for i in range(shift, shift + N, W)]
intra_links = ("{0}_{1}_{2}.html".format(name, i, min(i + W, maxpos)) for i in range(shift, shift + N, W))
combobox = self.create_combobox(links, "sub", True)
combobox_intra = self.create_combobox(intra_links, "sub", False)
datatable = self._init_datatable_function(rois)
# break the chromosome as pieces of 200,000 bp
for i in range(shift, shift + N, W):
SubCoverageModule(chrom, rois, combobox_intra, datatable, i, min(i + W, maxpos), directory)
self.sections.append({"name": "Subcoverage", "anchor": "subcoverage", "content": combobox})
return links
def _init_datatable_function(self, rois):
"""Initiate :class:`DataTableFunction` to create table to link each
row with sub HTML report. All table will have the same appearance. So,
let's initiate its only once.
"""
datatable_js = DataTableFunction(rois.df, "roi")
datatable_js.datatable_options = {
"scrollX": "true",
"pageLength": 15,
"scrollCollapse": "true",
"dom": "Bfrtip",
"buttons": ["copy", "csv"],
}
return datatable_js
[docs] def basic_stats(self):
"""Basics statistics section."""
li = "<li><b>{0}</b> ({1}): {2:.2f}</li>"
stats = self.chromosome.get_stats()
description = {
"BOC": "breadth of coverage: the proportion (in %s) " + " of the genome covered by at least one read.",
"CV": "the coefficient of variation.",
"DOC": "the sequencing depth (Depth of Coverage), that is the " + "average the genome coverage.",
"MAD": "median of the absolute median deviation defined as median(|X-median(X)|).",
"Median": "Median of the coverage.",
"STD": "standard deviation.",
"GC": "GC content (%)",
}
data = [
li.format(key, description[key], stats[key])
for key in ["BOC", "CV", "DOC", "MAD", "Median", "STD", "GC"]
if key in stats.keys()
]
stats = "<ul>{0}</ul>".format("\n".join(data))
self.sections.append(
{
"name": "Basic stats",
"anchor": "basic_stats",
"content": "<p>Here are some basic statistics about the " "genome coverage.</p>\n{0}".format(stats),
}
)
[docs] def regions_of_interest(self, rois, links):
"""Region of interest section."""
def connect_link(x):
for link in links:
_, x1, x2 = link.rsplit(os.sep)[1].rstrip(".html").rsplit("_", 2)
x1 = int(x1)
x2 = int(x2)
if x >= x1 and x <= x2:
return link
# for the case where the data is fully stored in memory, we must
# find all events !
if self.chromosome._mode == "memory" and self.chromosome.binning == 1:
raise Exception("{} position not in the range of reports".format(x))
if links:
links_list = [connect_link(n) for n in rois.df["start"]]
else:
links_list = [None for n in rois.df["start"]]
rois.df["link"] = links_list
# create datatable
low_roi = rois.get_low_rois()
high_roi = rois.get_high_rois()
datatable = CoverageModule.init_roi_datatable(low_roi)
datatable.set_links_to_column("link", "chr")
js = datatable.create_javascript_function()
lroi = DataTable(low_roi, "lroi", datatable)
hroi = DataTable(high_roi, "hroi", datatable)
html_low_roi = lroi.create_datatable(float_format="%.3g")
html_high_roi = hroi.create_datatable(float_format="%.3g")
rois.df.drop("link", axis=1, inplace=True)
roi_paragraph = (
"<p>Regions with a z-score {0}er than {1:.2f} and at "
"least one base with a z-score {0}er than {2:.2f} are detected."
"There are {3} {0} regions of interest."
"</p>"
)
low_paragraph = roi_paragraph.format(
"low",
self.chromosome.thresholds.low2,
self.chromosome.thresholds.low,
len(low_roi),
)
high_paragraph = roi_paragraph.format(
"high",
self.chromosome.thresholds.high2,
self.chromosome.thresholds.high,
len(high_roi),
)
self.sections.append(
{
"name": "Regions Of Interest (ROI)",
"anchor": "roi",
"content": "{4}\n"
"<p>The following tables give regions of "
"interest detected by sequana. Here are the definitions of the "
"columns:</p>\n"
"<ul><li>mean_cov: the average of coverage</li>\n"
"<li>mean_rm: the average of running median</li>\n"
"<li>mean_zscore: the average of zscore</li>\n"
"<li>max_zscore: the higher zscore contains in the region</li>"
"</ul>\n"
"<h3>Low coverage region</h3>\n{0}\n{1}\n"
"<h3>High coverage region</h3>\n{2}\n{3}\n".format(
low_paragraph, html_low_roi, high_paragraph, html_high_roi, js
),
}
)
[docs] def gc_vs_coverage(self):
"""3 dimensional plot of GC content versus coverage."""
image = self.create_embedded_png(self.chromosome.plot_gc_vs_coverage, input_arg="filename")
corr = self.chromosome.get_gc_correlation()
self.sections.append(
{
"name": "Coverage vs GC content",
"anchor": "cov_vs_gc",
"content": "<p>The correlation coefficient between the coverage and GC "
"content is <b>{0:.3g}</b> with a window size of {1}bp.</p>\n"
"{2}\n"
"<p>Note: the correlation coefficient has to be between -1.0 "
"and 1.0. A coefficient of 0 means no correlation, while a "
"coefficient of -1 or 1 means an existing "
"correlation between GC and Coverage</p>".format(corr, self.chromosome.bed.gc_window_size, image),
}
)
[docs] def normalized_coverage(self):
"""Barplot of normalized coverage section."""
if self.chromosome._mode == "chunks":
self._add_large_data_section("Normalised coverage", "normalised_coverage")
return
image = self.create_embedded_png(self.chromosome.plot_hist_normalized_coverage, input_arg="filename")
self.sections.append(
{
"name": "Normalised coverage",
"anchor": "normalised_coverage",
"content": "<p>Distribution of the normalised coverage with predicted "
"Gaussian. The red line should be followed the trend of the "
"barplot.</p>\n{0}".format(image),
}
)
[docs] def zscore_distribution(self):
"""Barplot of zscore distribution section."""
if self.chromosome._mode == "chunks":
self._add_large_data_section("Z-Score distribution", "zscore_distribution")
return
image = self.create_embedded_png(self.chromosome.plot_hist_zscore, input_arg="filename")
self.sections.append(
{
"name": "Z-Score distribution",
"anchor": "zscore_distribution",
"content": "<p>Distribution of the z-score (normalised coverage); You "
"should see a Gaussian distribution centered around 0. The "
"estimated parameters are mu={0:.2f} and sigma={1:.2f}.</p>\n"
"{2}".format(
self.chromosome.best_gaussian["mu"],
self.chromosome.best_gaussian["sigma"],
image,
),
}
)
class SubCoverageModule(SequanaBaseModule):
"""Write HTML report of subsection of chromosome with a javascript
coverage plot.
"""
def __init__(self, chromosome, rois, combobox, datatable, start, stop, directory):
super().__init__()
if directory == ".":
self.path = "../"
else:
self.path = "../../"
self.chromosome = chromosome
self.rois = rois
self.combobox = combobox
self.datatable = datatable
self.start = start
self.stop = stop
self.title = "Coverage analysis of chromosome {0}<br>" "positions {1} and {2}".format(
self.chromosome.chrom_name, start, stop
)
self.create_report_content()
self.create_html(
"{0}{4}subplots{4}{1}_{2}_{3}.html".format(directory, self.chromosome.chrom_name, start, stop, os.sep)
)
def create_report_content(self):
"""Generate the sections list to fill the HTML report."""
self.sections = list()
self.canvasjs_line_plot()
self.regions_of_interest()
def canvasjs_line_plot(self):
"""Create the CanvasJS line plot section."""
# set column of interest and create the csv
x_col = "pos"
y_col = ("cov", "mapq0", "gc")
columns = self.chromosome.df.columns
y_col = [n for n in y_col if n in columns]
csv = self.chromosome.to_csv(
start=self.start,
stop=self.stop,
columns=[x_col] + y_col,
index=False,
float_format="%.3g",
)
# create CanvasJS stuff
cjs = CanvasJSLineGraph(csv, "cov", x_col, y_col)
# set options
cjs.set_options({"zoomEnabled": "true", "zoomType": "x", "exportEnabled": "true"})
# set title
cjs.set_title("Genome Coverage")
# set legend
cjs.set_legend(
{
"verticalAlign": "bottom",
"horizontalAlign": "center",
"cursor": "pointer",
},
hide_on_click=True,
)
# set axis
cjs.set_axis_x(
{
"title": "Position (bp)",
"labelAngle": 30,
"minimum": self.start,
"maximum": self.stop,
}
)
cjs.set_axis_y({"title": "Coverage (Count)"})
cjs.set_axis_y2(
{
"title": "GC content (ratio)",
"minimum": 0,
"maximum": 1,
"lineColor": "#FFC425",
"titleFontColor": "#FFC425",
"labelFontColor": "#FFC425",
}
)
# set datas
cjs.set_data(
index=0,
data_dict={
"type": "line",
"name": "Coverage",
"showInLegend": "true",
"color": "#5BC0DE",
"lineColor": "#5BC0DE",
},
)
try:
i = y_col.index("mapq0")
cjs.set_data(
index=i,
data_dict={
"type": "line",
"name": "Coverage 2",
"showInLegend": "true",
"color": "#D9534F",
"lineColor": "#D9534F",
},
)
except ValueError:
pass
try:
i = y_col.index("gc")
cjs.set_data(
index=i,
data_dict={
"type": "line",
"axisYType": "secondary",
"name": "GC content",
"showInLegend": "true",
"color": "#FFC425",
"lineColor": "#FFC425",
},
)
except ValueError:
pass
# create canvasJS
html_cjs = cjs.create_canvasjs()
self.sections.append(
{
"name": "Interactive coverage plot",
"anchor": "iplot",
"content": ("{0}{1}\n".format(self.combobox, html_cjs)),
}
)
def regions_of_interest(self):
"""Region of interest section."""
subseq = [self.start, self.stop]
low_roi = self.rois.get_low_rois(subseq)
high_roi = self.rois.get_high_rois(subseq)
js = self.datatable.create_javascript_function()
lroi = DataTable(low_roi, "lroi", self.datatable)
hroi = DataTable(high_roi, "hroi", self.datatable)
html_low_roi = lroi.create_datatable(float_format="%.3g")
html_high_roi = hroi.create_datatable(float_format="%.3g")
roi_paragraph = (
"<p>Regions with a z-score {0}er than {1:.2f} and at "
"least one base with a z-score {0}er than {2:.2f} are detected as "
"{0} coverage region. Thus, there are {3} {0} coverage regions "
"between the position {4} and the position {5}</p>"
)
low_paragraph = roi_paragraph.format(
"low",
self.chromosome.thresholds.low2,
self.chromosome.thresholds.low,
len(low_roi),
self.start,
self.stop,
)
high_paragraph = roi_paragraph.format(
"high",
self.chromosome.thresholds.high2,
self.chromosome.thresholds.high,
len(high_roi),
self.start,
self.stop,
)
self.sections.append(
{
"name": "Regions Of Interest (ROI)",
"anchor": "roi",
"content": "{4}\n"
"<p>Running median is the median computed along the genome "
"using a sliding window. The following tables give regions of "
"interest detected by sequana. Here are some definition of the "
"table's columns:</p>\n"
"<ul><li><b>mean_cov</b>: the average of coverage</li>\n"
"<li><b>mean_rm</b>: the average of running median</li>\n"
"<li><b>mean_zscore</b>: the average of zscore</li>\n"
"<li><b>max_zscore</b>: the higher zscore contains in the "
"region</li>\n"
"<li><b>log2_ratio</b>:log2(mean_cov/mean_rm)</li></ul>\n"
"<h3>Low coverage region</h3>\n{0}\n{1}\n"
"<h3>High coverage region</h3>\n{2}\n{3}\n".format(
low_paragraph, html_low_roi, high_paragraph, html_high_roi, js
),
}
)