Source code for kmeans

"""
Elastic Functional Clustering

moduleauthor:: J. Derek Tucker <jdtuck@sandia.gov>

"""

import numpy as np
import fdasrsf.utility_functions as uf
from scipy.integrate import trapezoid
from numpy.linalg import norm
from joblib import Parallel, delayed
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors



[docs]
def kmeans_align(
    f,
    time,
    K,
    seeds=None,
    lam=0,
    showplot=True,
    smooth_data=False,
    parallel=False,
    alignment=True,
    omethod="DP2",
    MaxItr=50,
    thresh=0.01,
):
    """
    This function clusters functions and aligns using the elastic square-root
    slope (srsf) framework.

    :param f: numpy ndarray of shape (M,N) of N functions with M samples
    :param time: vector of size M describing the sample points
    :param K: number of clusters
    :param seeds: indexes of cluster center functions (default = None)
    :param lam: controls the elasticity (default = 0)
    :param showplot: shows plots of functions (default = T)
    :param smooth_data: smooth data using box filter (default = F)
    :param parallel: enable parallel mode using \code{\link{joblib}} and \code{doParallel} package (default=F)
    :param alignment: whether to perform alignment (default = T)
    :param omethod: optimization method (DP,DP2,RBFGS)
    :param MaxItr: maximum number of iterations
    :param thresh: cost function threshold
    :type f: np.ndarray
    :type time: np.ndarray

    :rtype: dictionary
    :return fn: aligned functions - matrix (N x M) of M functions with N samples which is a list for each cluster
    :return qn: aligned SRSFs - similar structure to fn
    :return q0: original SRSFs
    :return labels: cluster labels
    :return templates: cluster center functions
    :return templates_q: cluster center SRSFs
    :return gam: warping functions - similar structure to fn
    :return qun: Cost Function

    """

    w = 0.0
    k = 1
    cores = -1
    eps = np.finfo(np.double).eps

    M = f.shape[0]
    N = f.shape[1]

    if seeds is None:
        a = np.arange(0, N, dtype=int)
        template_ind = np.random.choice(a, K)
    else:
        template_ind = seeds

    templates = np.zeros((M, K))
    for i in range(K):
        templates[:, i] = f[:, template_ind[i]]

    cluster_id = np.zeros(N, dtype=int)
    qun = np.zeros(MaxItr)

    # convert to SRSF
    f, g, g2 = uf.gradient_spline(time, f, smooth_data)
    q = g / np.sqrt(abs(g) + eps)
    templates_q = np.zeros((M, K))
    for i in range(K):
        templates_q[:, i] = q[:, template_ind[i]]

    for itr in range(0, MaxItr):
        print("updating step: r=%d" % (itr + 1))

        # Alignment
        gam = {}
        Dy = np.zeros((K, N))
        qn = {}
        fn = {}

        for k in range(K):
            gam_tmp = np.zeros((M, N))
            if alignment:
                if parallel:
                    out = Parallel(n_jobs=cores)(
                        delayed(uf.optimum_reparam)(
                            templates_q[:, k], time, q[:, n], omethod, lam
                        )
                        for n in range(N)
                    )
                    gam_tmp = np.array(out)
                    gam_tmp = gam_tmp.transpose()
                else:
                    for n in range(0, N):
                        gam_tmp[:, n] = uf.optimum_reparam(
                            templates_q[:, k], time, q[:, n], omethod, lam
                        )
            else:
                for n in range(0, N):
                    gam_tmp[:, k] = np.linspace(0, 1, M)

            fw = np.zeros((M, N))
            qw = np.zeros((M, N))
            dist = np.zeros(N)
            for i in range(0, N):
                fw[:, i] = uf.warp_f_gamma(time, f[:, i], gam_tmp[:, i])
                qw[:, i] = uf.f_to_srsf(fw[:, i], time)
                dist[i] = np.sqrt(trapezoid((qw[:, i] - templates_q[:, k]) ** 2, time))

            Dy[k, :] = dist
            qn[k] = qw
            fn[k] = fw
            gam[k] = gam_tmp

        # Assignment
        cluster_id = Dy.argmin(axis=0)

        # Normalization
        for k in range(K):
            idx = np.where(cluster_id == k)[0]
            ftmp = fn[k][:, idx]
            gamtmp = gam[k][:, idx]
            gamI = uf.SqrtMeanInverse(gamtmp)
            N1 = idx.shape[0]

            gamt = np.zeros((M, N1))
            f_temp = np.zeros((M, N1))
            q_temp = np.zeros((M, N1))
            if parallel:
                out = Parallel(n_jobs=cores)(
                    delayed(norm_sub)(ftmp[:, i], time, gamtmp[:, i], gamI)
                    for i in range(N1)
                )
                for i in range(0, N1):
                    f_temp[:, i] = out[i][0]
                    q_temp[:, i] = out[i][1]
                    gamt[:, i] = out[i][2]
            else:
                for i in range(N1):
                    f_temp[:, i], q_temp[:, i], gamt[:, i] = norm_sub(
                        ftmp[:, i], time, gamtmp[:, i], gamI
                    )

            qn[k][:, idx] = q_temp
            fn[k][:, idx] = f_temp
            gam[k][:, idx] = gamt

        # Template Identification
        qun_t = np.zeros(K)
        old_templates_q = templates_q.copy()
        for k in range(K):
            idx = np.where(cluster_id == k)[0]
            templates_q[:, k] = qn[k][:, idx].mean(axis=1)
            templates[:, k] = fn[k][:, idx].mean(axis=1)

            qun_t[k] = norm(templates_q[:, k] - old_templates_q[:, k]) / norm(
                old_templates_q[:, k]
            )

        qun[itr] = qun_t.mean()

        if qun[itr] < thresh:
            break

    # Output
    ftmp = {}
    qtmp = {}
    gamtmp = {}
    for k in range(K):
        idx = np.where(cluster_id == k)[0]
        ftmp[k] = fn[k][:, idx]
        qtmp[k] = qn[k][:, idx]
        gamtmp[k] = gam[k][:, idx]

    out = {}
    out["f0"] = f
    out["q0"] = q
    out["time"] = time
    out["fn"] = ftmp
    out["qn"] = qtmp
    out["gam"] = gamtmp
    out["labels"] = cluster_id
    out["templates"] = templates
    out["templates_q"] = templates_q
    out["lambda"] = lam
    out["omethod"] = omethod
    out["qun"] = qun[0:itr]

    if showplot:
        num_plot = int(np.ceil(K / 6))
        a = mcolors.TABLEAU_COLORS
        colors = list(a.keys())
        plt.figure()
        plt.plot(time, f)
        plt.title("Original Data")

        plt.figure()
        plt.plot(time, templates)
        plt.title("Cluster Mean Functions")

        for k in range(num_plot):
            cnt = 1
            plt.figure()
            for n in np.arange(k * 6, min(K, (k + 1) * 6), dtype=int):
                ax = plt.subplot(2, 3, cnt)
                ax.plot(time, ftmp[n], color="lightgrey")
                ax.plot(time, templates[:, n], color=colors[cnt - 1])
                ax.set_title("Cluster f: %d" % n)
                cnt += 1

        for k in range(num_plot):
            cnt = 1
            plt.figure()
            for n in np.arange(k * 6, min(K, (k + 1) * 6), dtype=int):
                ax = plt.subplot(2, 3, cnt)
                ax.plot(time, qtmp[n], color="lightgrey")
                ax.plot(time, templates_q[:, n], color=colors[cnt - 1])
                ax.set_title("Cluster q: %d" % n)
                cnt += 1

        plt.show()

    return out



def norm_sub(f, time, gam, gamI):
    fw = uf.warp_f_gamma(time, f, gamI)
    qw = uf.f_to_srsf(fw, time)
    time0 = (time[-1] - time[0]) * gamI + time[0]
    gamw = np.interp(time0, time, gam)

    return (fw, qw, gamw)