Source code for sequana.macs3

#
#  This file is part of Sequana software
#
#  Copyright (c) 2016-2023 - Sequana Development Team
#
#  Distributed under the terms of the 3-clause BSD license.
#  The full license is in the LICENSE file, distributed with this software.
#
#  website: https://github.com/sequana/sequana
#  documentation: http://sequana.readthedocs.io
#
##############################################################################

import colorlog

from sequana.lazy import pandas as pd
from sequana.lazy import pylab
from sequana.utils.pandas import PandasReader
from sequana.viz.venn import plot_venn

logger = colorlog.getLogger(__name__)


__all__ = ["MACS3Reader", "PeakConsensus"]


[docs]class MACS3Reader:
    """This class reads output of macs3 tool

    ::

        from sequana import MACS3Reader
        mr = MACS3Reader(data)
        mr.df
        mr.plot_volcano()


    If input file ends in .xls, we assume this is a NAME_peaks.xls file. It is
    therefore a tabulated file where columns are:

    - chromosome name
    - start position of peak
    - end position of peak
    - length of peak region
    - absolute peak summit position
    - pileup height at peak summit
    - -log10(pvalue) for the peak summit (e.g. pvalue =1e-10, then this value should be 10)
    - fold enrichment for this peak summit against random Poisson distribution with local lambda,
    - -log10(qvalue) at peak summit

    Note that coordinates in XLS is 1-based which is different from BED format
    that follows. When --broad is enabled for broad peak calling, the pileup, p-value,
    q-value, and fold change in the XLS file will be the mean value across
    the entire peak region, since peak summit won't be called in broad peak calling mode.

    NAME_peaks.narrowPeak is BED6+4 format file which contains the peak locations
    together with peak summit, p-value, and q-value.

    The fifth column is the **score** calculate as int(-10*log10_pvalue) or
    int(-10*log10_pvalue) where log10-pvalue/log10-qvalue is the 8th or 9th column
    depending on whether -p (pvalue) or -q (qvalue) is used as score cutoff. The 7th
    columns stores the fold-change at peak summit. The 8th is -log10pvalue at peak summit
    and the 9th is -log10qvalue at peak summit. The 10th is the relative summit position
    to peak start equivalent to absolute peak summit position in the xls file.

    NAME_peaks.broadPeak is in BED6+3 format which is similar to narrowPeak file, except
    for missing the 10th column for annotating peak summits. This file and the
    gappedPeak file will only be available when --broad is enabled. Since in the
    broad peak calling mode, the peak summit won't be called, the values in the 5th,
    and 7-9th columns are the mean value across all positions in the peak region.
    Refer to narrowPeak if you want to fix the value issue in the 5th column

    .. reference:: https://github.com/macs3-project/MACS/blob/master/docs/callpeak.md
    """

    def __init__(self, filename):
        # .xls is not stuctured similarly to .narrowPeak
        if filename.endswith(".xls"):
            #
            self.df = PandasReader(filename, comment="#", sep="\t").df
            # narrow and broad files have this header in common and can be checked.
            for x in [
                "chr",
                "start",
                "end",
                "abs_summit",
                "-log10(pvalue)",
                "fold_enrichment",
                "-log10(qvalue)",
                "name",
            ]:
                # we could have also pileup depending on the user's options
                assert x in self.df.columns
            self.df["stop"] = self.df["end"]
        elif filename.endswith("narrowPeak"):
            columns = [
                "chr",
                "start",
                "stop",
                "name",
                "score",
                "NA",
                "fold_enrichment",
                "-log10(pvalue)",
                "-log10(qvalue)",
                "abs_summit_from_start",
            ]
            self.df = PandasReader(filename, header=None, sep="\t", columns=columns).df
        elif filename.endswith("broadPeak"):
            columns = [
                "chr",
                "start",
                "stop",
                "name",
                "score",
                "NA",
                "fold_enrichment",
                "-log10(pvalue)",
                "-log10(qvalue)",
            ]
            self.df = PandasReader(filename, header=None, sep="\t", columns=columns).df
        self.df["length"] = self.df["stop"] - self.df["start"]
        #

    def __len__(self):
        return len(self.df)

[docs]    def plot_volcano(self, plotly=False, marker_color="b", title=""):
        """Plots -log10 p-value versus fold change of all peaks"""

        if plotly:
            from plotly import express as px

            df = self.df.copy()
            df["log_adj_pvalue"] = self.df["-log10(pvalue)"]
            df["log2FoldChange"] = pylab.log2(self.df["fold_enrichment"])
            df["info"] = df["chr"] + ":" + df["start"].astype(str) + "-" + df["stop"].astype(str)

            fig = px.scatter(
                df,
                x="log2FoldChange",
                y="log_adj_pvalue",
                hover_name="info",
                log_y=False,
                color="length",
                height=600,
                title=title,
                labels={"log2_fold_enrichment": "log10 p-value"},
            )
            return fig

        pylab.scatter(
            pylab.log2(self.df["fold_enrichment"]),
            self.df["-log10(pvalue)"],
            marker="o",
            alpha=0.5,
            color=marker_color,
            lw=0,
            s=self.df["length"] / self.df["length"].max() * 400,
        )
        pylab.xlabel("Fold enrichment")
        pylab.ylabel("log10 pvalue")
        pylab.title(title)


[docs]class PeakConsensus:
    def __init__(self, f1, f2):
        self.df1 = MACS3Reader(f1).df
        self.df1["category"] = "first"
        self.df2 = MACS3Reader(f2).df
        self.df2["category"] = "second"
        self.df_merged = self.merge()

[docs]    def merge(self, overlap=0.2):
        df = pd.concat([self.df1, self.df2]).sort_values(["chr", "start"])
        # if overlap at least one base, we merge the peaks and label them with
        # common information, otherwise we report the original peak

        merged = []
        prev = None
        N1 = 0
        N2 = 0
        N12 = 0
        skip_next = True
        for k, current in df.iterrows():
            if skip_next:
                prev = current
                skip_next = False
                continue

            # if current overlaps the prev start or end, there is overlap
            # or if current included in prev there current and prev overlaps
            if current["start"] <= prev["start"] and current["stop"] >= prev["start"]:
                overlap = True
                N12 += 1
            elif current["start"] <= prev["stop"] and current["stop"] >= prev["stop"]:
                overlap = True
                N12 += 1
            elif current["start"] >= prev["start"] and current["stop"] <= prev["stop"]:
                overlap = True
                N12 += 1
            else:
                overlap = False
                if prev["name"].startswith("1_vs_6_7"):
                    N1 += 1
                elif prev["name"].startswith("2_vs_6_7"):
                    N2 += 1

            if overlap:
                m = min(current["start"], prev["start"])
                M = max(current["stop"], prev["stop"])
                data = current.copy()
                data["start"] = m
                data["stop"] = M
                data["category"] = "both"
                merged.append(data)
                skip_next = True
            else:
                m = min(current["start"], prev["start"])
                M = max(current["stop"], prev["stop"])
                merged.append(prev)
                skip_next = False

            prev = current
        df = pd.DataFrame(merged)
        df = df.reset_index(drop=True)
        return df

[docs]    def plot_venn(self, title="", labels=[]):
        plot_venn(
            (
                set(self.df_merged.query("category in ['first', 'both']").index),
                set(self.df_merged.query("category in ['second', 'both']").index),
            ),
            labels=(("first", "second")),
        )

[docs]    def to_saf(self, filename):
        """For now, all strand are categorised as strand + .
        strand not used in the pipeline for now."""
        df = self.df_merged.reset_index()
        df["strand"] = "+"
        df["GeneID"] = df["index"]
        df[["GeneID", "chr", "start", "stop", "strand", "category"]].to_csv(filename, sep="\t", index=None)

[docs]    def to_bed(self, filename):
        # output to be used by homer
        # annotatePeaks.pl file.bed file.fa -gid -gff file.gff
        # GeneID seems to have to start with Interval_ ??

        df = self.df_merged.reset_index()
        df["strand"] = "+"
        df["GeneID"] = ["Interval_{}".format(x) for x in df["index"]]
        mapper = {"both": 3, "first": 1, "second": 2}
        df["score"] = [mapper[x] for x in df["category"]]
        df[["chr", "start", "stop", "GeneID", "score", "strand"]].to_csv(filename, sep="\t", index=None, header=False)