Source code for sequana.viz.pca

#
#  This file is part of Sequana software
#
#  Copyright (c) 2016-2021 - Sequana Development Team
#
#
#  Distributed under the terms of the 3-clause BSD license.
#  The full license is in the LICENSE file, distributed with this software.
#
#  website: https://github.com/sequana/sequana
#  documentation: http://sequana.readthedocs.io
#
##############################################################################

import colorlog
from adjustText import adjust_text

from sequana.lazy import numpy as np
from sequana.lazy import pylab
from sequana.viz import clusterisation

logger = colorlog.getLogger(__name__)


__all__ = ["PCA"]


[docs]class PCA(clusterisation.Cluster):
    """

    .. plot::
        :include-source:

        from sequana.viz.pca import PCA
        from sequana import sequana_data
        import pandas as pd

        data = sequana_data("test_pca.csv")
        df = pd.read_csv(data)
        df = df.set_index("Id")
        p = PCA(df, colors={
            "A1": 'r', "A2": 'r', 'A3': 'r',
            "B1": 'b', "B2": 'b', 'B3': 'b'})
        p.plot(n_components=2)


    From R, a PCA is selecting the first 500 features based on variance.


    """

    def __init__(self, data, colors={}):
        super(PCA, self).__init__(data, colors)

[docs]    def plot_pca_vs_max_features(self, step=100, n_components=2, transform="log"):
        """

        .. plot::
            :include-source:

            from sequana.viz.pca import PCA
            from sequana import sequana_data
            import pandas as pd

            data = sequana_data("test_pca.csv")
            df = pd.read_csv(data)
            df = df.set_index("Id")

            p = PCA(df)
            p.plot_pca_vs_max_features()

        """
        assert n_components in [2, 3, 4]
        N = len(self.df)
        if step > N:
            step = N

        # We start with at least 5 features
        X = range(10, N, step)
        Y = []
        for i, x in enumerate(X):
            res = self.plot(n_components=n_components, max_features=x, show_plot=False, transform=transform)
            Y.append(res)

        sub = n_components
        pylab.subplot(sub, 1, 1)
        pylab.plot(X, [y[0] * 100 for y in Y])
        pylab.ylabel("PC1 (%)")
        pylab.subplot(sub, 1, 2)
        pylab.plot(X, [y[1] * 100 for y in Y])
        pylab.ylabel("PC2 (%)")
        if sub >= 3:
            pylab.subplot(sub, 1, 3)
            pylab.plot(X, [y[2] * 100 for y in Y])
            pylab.ylabel("PC3 (%)")
        if sub >= 4:
            pylab.subplot(sub, 1, 4)
            pylab.plot(X, [y[3] * 100 for y in Y])
            pylab.ylabel("PC4 (%)")

[docs]    def plot(
        self,
        n_components=2,
        transform="log",
        switch_x=False,
        switch_y=False,
        switch_z=False,
        colors=None,
        max_features=500,
        show_plot=True,
        fontsize=10,
        adjust=True,
    ):
        """

        :param n_components: at number starting at 2 or a value below 1
            e.g. 0.95 means select automatically the number of components to
            capture 95% of the variance
        :param transform: can be 'log' or 'anscombe', log is just log10. count
            with zeros, are set to 1
        """
        from sklearn.decomposition import PCA

        pylab.clf()

        pca = PCA(n_components)

        # scale the data
        data = self.scale_data(transform_method=transform)

        # keep only top variable features
        tokeep = data.std(axis=1).sort_values(ascending=False).index[0:max_features]
        data = data.loc[tokeep]

        pca.fit(data.T)

        if transform == "standard":
            # If scale data used a scaler, then we need to use it for the transformation
            Xr = pca.transform(self.scaler.fit_transform(self.df.loc[tokeep].T))
        else:
            # otherwise, noting to do
            Xr = pca.transform(self.df.loc[tokeep].T)
        self.Xr = Xr

        if switch_x:
            Xr[:, 0] *= -1
        if switch_y:
            Xr[:, 1] *= -1
        if switch_z:
            Xr[:, 2] *= -1

        def adjust_func():
            texts = [x for x in pylab.gca().get_children() if "text.Annotation" in str(x.__class__)]
            adjust_text(texts)

        # PC1 vs PC2
        if show_plot:
            pylab.figure(1)
            self._plot(Xr, pca=pca, pc1=0, pc2=1, colors=colors, fontsize=fontsize)
            if adjust:
                adjust_func()

        if len(pca.explained_variance_ratio_) >= 3:
            if show_plot:
                pylab.figure(2)
                self._plot(Xr, pca=pca, pc1=0, pc2=2, colors=colors, fontsize=fontsize)
                if adjust:
                    adjust_func()

                pylab.figure(3)
                self._plot(Xr, pca=pca, pc1=1, pc2=2, colors=colors, fontsize=fontsize)
                if adjust:
                    adjust_func()

        return pca.explained_variance_ratio_