# Source code for kmeans

"""
Elastic Functional Clustering

moduleauthor:: J. Derek Tucker <jdtuck@sandia.gov>

"""

import numpy as np
import fdasrsf.utility_functions as uf
from scipy.integrate import trapz
from numpy.linalg import norm
from joblib import Parallel, delayed
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors


def kmeans_align(f, time, K, seeds=None, lam=0, showplot=True,
                 smooth_data=False, parallel=False, alignment=True,
                 omethod="DP2", MaxItr=50, thresh=0.01):
    """
    Cluster and align functions using the elastic square-root slope
    function (SRSF) framework.

    Each iteration (1) warps every function towards every cluster center,
    (2) assigns each function to the closest center in SRSF L2 distance,
    (3) re-centers the warpings inside each cluster, and (4) recomputes the
    cluster centers as cross-sectional means.  Iteration stops after
    ``MaxItr`` rounds or once the mean relative change of the center SRSFs
    falls below ``thresh``.

    :param f: numpy ndarray of shape (M,N) of N functions with M samples
    :param time: vector of size M describing the sample points
    :param K: number of clusters
    :param seeds: indexes (columns of ``f``) of the initial cluster center
                  functions; only the first K are used.  When None, K
                  columns are drawn at random, without repetition whenever
                  K <= N (default = None)
    :param lam: controls the elasticity; forwarded to
                ``uf.optimum_reparam`` (default = 0)
    :param showplot: shows plots of functions (default = True)
    :param smooth_data: smooth the data; forwarded to
                        ``uf.gradient_spline`` (default = False)
    :param parallel: run the per-function work through joblib using all
                     available cores (default = False)
    :param alignment: whether to perform alignment; when False identity
                      warping functions are used (default = True)
    :param omethod: optimization method (DP, DP2, RBFGS); forwarded to
                    ``uf.optimum_reparam``
    :param MaxItr: maximum number of iterations (must be >= 1)
    :param thresh: cost function threshold used as stopping criterion
    :type f: np.ndarray
    :type time: np.ndarray

    :raises ValueError: if ``MaxItr`` is smaller than 1 or fewer than K
                        seeds are supplied

    :rtype: dictionary
    :return f0: functions as returned by ``uf.gradient_spline``
    :return q0: original SRSFs
    :return time: the sample points passed in
    :return fn: aligned functions - dictionary keyed by cluster index, each
                entry an (M x N_k) array holding the members of cluster k
    :return qn: aligned SRSFs - similar structure to fn
    :return gam: warping functions - similar structure to fn
    :return labels: cluster label of each of the N functions
    :return templates: cluster center functions, shape (M,K)
    :return templates_q: cluster center SRSFs, shape (M,K)
    :return lambda: the value of ``lam``
    :return omethod: the value of ``omethod``
    :return qun: cost function - mean relative change of the cluster
                 center SRSFs, one value per completed iteration

    """
    if MaxItr < 1:
        raise ValueError("MaxItr must be at least 1")

    cores = -1  # joblib: use every available core
    eps = np.finfo(np.double).eps
    M = f.shape[0]
    N = f.shape[1]

    if seeds is None:
        # Draw distinct seeds when possible: duplicated centers start out
        # identical, so one of them would never receive any member.
        template_ind = np.random.choice(N, K, replace=K > N)
    else:
        template_ind = np.atleast_1d(np.asarray(seeds))
        if template_ind.shape[0] < K:
            raise ValueError("seeds must contain at least K indexes")
        template_ind = template_ind[:K]

    # Centers in function space are taken from the raw input data.
    templates = np.array(f[:, template_ind], dtype=float)
    qun = np.zeros(MaxItr)

    # convert to SRSF
    f, g, _ = uf.gradient_spline(time, f, smooth_data)
    q = g / np.sqrt(np.abs(g) + eps)
    templates_q = np.array(q[:, template_ind], dtype=float)

    identity_gam = np.linspace(0, 1, M)

    for itr in range(MaxItr):
        print("updating step: r=%d" % (itr + 1))

        # Alignment: warp every function towards every cluster center
        gam = {}
        qn = {}
        fn = {}
        Dy = np.zeros((K, N))
        for k in range(K):
            gam_tmp = np.zeros((M, N))
            if not alignment:
                # Every function (column) gets the identity warping.
                gam_tmp[:] = identity_gam[:, np.newaxis]
            elif parallel:
                res = Parallel(n_jobs=cores)(
                    delayed(uf.optimum_reparam)(
                        templates_q[:, k], time, q[:, n], omethod, lam)
                    for n in range(N))
                gam_tmp = np.array(res).transpose()
            else:
                for n in range(N):
                    gam_tmp[:, n] = uf.optimum_reparam(
                        templates_q[:, k], time, q[:, n], omethod, lam)

            fw = np.zeros((M, N))
            qw = np.zeros((M, N))
            for i in range(N):
                fw[:, i] = uf.warp_f_gamma(time, f[:, i], gam_tmp[:, i])
                qw[:, i] = uf.f_to_srsf(fw[:, i], time)
                # L2 distance between the aligned SRSF and the center SRSF
                Dy[k, i] = np.sqrt(
                    trapz((qw[:, i] - templates_q[:, k]) ** 2, time))

            qn[k] = qw
            fn[k] = fw
            gam[k] = gam_tmp

        # Assignment: closest cluster center wins
        cluster_id = Dy.argmin(axis=0)
        members = [np.where(cluster_id == k)[0] for k in range(K)]

        # Normalization: re-center the warpings of each cluster
        for k, idx in enumerate(members):
            if idx.size == 0:
                # Nothing to normalize for a cluster without members.
                continue
            # Fancy indexing copies, so the updates below cannot disturb
            # the inputs that are still being read.
            ftmp = fn[k][:, idx]
            gamtmp = gam[k][:, idx]
            gamI = uf.SqrtMeanInverse(gamtmp)
            N1 = idx.shape[0]
            if parallel:
                res = Parallel(n_jobs=cores)(
                    delayed(norm_sub)(ftmp[:, i], time, gamtmp[:, i], gamI)
                    for i in range(N1))
            else:
                res = [norm_sub(ftmp[:, i], time, gamtmp[:, i], gamI)
                       for i in range(N1)]
            for col, (f_i, q_i, gam_i) in zip(idx, res):
                fn[k][:, col] = f_i
                qn[k][:, col] = q_i
                gam[k][:, col] = gam_i

        # Template Identification
        qun_t = np.zeros(K)
        old_templates_q = templates_q.copy()
        for k, idx in enumerate(members):
            if idx.size == 0:
                # An empty cluster keeps its previous center (relative
                # change 0) instead of becoming the NaN mean of nothing.
                continue
            templates_q[:, k] = qn[k][:, idx].mean(axis=1)
            templates[:, k] = fn[k][:, idx].mean(axis=1)
            qun_t[k] = (norm(templates_q[:, k] - old_templates_q[:, k])
                        / norm(old_templates_q[:, k]))

        qun[itr] = qun_t.mean()
        if qun[itr] < thresh:
            break

    # Output: keep only the members of each cluster
    ftmp = {}
    qtmp = {}
    gamtmp = {}
    for k in range(K):
        idx = np.where(cluster_id == k)[0]
        ftmp[k] = fn[k][:, idx]
        qtmp[k] = qn[k][:, idx]
        gamtmp[k] = gam[k][:, idx]

    out = {}
    out['f0'] = f
    out['q0'] = q
    out['time'] = time
    out['fn'] = ftmp
    out['qn'] = qtmp
    out['gam'] = gamtmp
    out['labels'] = cluster_id
    out['templates'] = templates
    out['templates_q'] = templates_q
    out['lambda'] = lam
    out['omethod'] = omethod
    # itr is the index of the last executed iteration, so include it.
    out['qun'] = qun[0:itr + 1]

    if showplot:
        num_plot = int(np.ceil(K / 6))
        colors = list(mcolors.TABLEAU_COLORS.keys())

        plt.figure()
        plt.plot(time, f)
        plt.title('Original Data')

        plt.figure()
        plt.plot(time, templates)
        plt.title('Cluster Mean Functions')

        # Up to six clusters per figure (2 x 3 grid); all function-space
        # figures first, then all SRSF-space figures.
        panels = ((ftmp, templates, 'Cluster f: %d'),
                  (qtmp, templates_q, 'Cluster q: %d'))
        for curves, centers, title in panels:
            for page in range(num_plot):
                plt.figure()
                first = page * 6
                last = min(K, first + 6)
                for cnt, n in enumerate(range(first, last), start=1):
                    ax = plt.subplot(2, 3, cnt)
                    if curves[n].shape[1] > 0:
                        # Skip empty clusters: no member curves to draw.
                        ax.plot(time, curves[n], color='lightgrey')
                    ax.plot(time, centers[:, n], color=colors[cnt - 1])
                    ax.set_title(title % n)

        plt.show()

    return out
def norm_sub(f, time, gam, gamI):
    """
    Apply the warping ``gamI`` to one function and its warping function.

    The function ``f`` is warped by ``gamI`` through ``uf.warp_f_gamma``,
    the result is converted to an SRSF with ``uf.f_to_srsf``, and ``gam``
    is linearly interpolated at the points of ``gamI`` rescaled to the
    range ``[time[0], time[-1]]``.

    :param f: samples of a single function
    :param time: sample points
    :param gam: warping function belonging to ``f``
    :param gamI: warping function to apply

    :rtype: tuple
    :return: (warped function, SRSF of the warped function,
              ``gam`` evaluated at the rescaled ``gamI``)

    """
    warped_f = uf.warp_f_gamma(time, f, gamI)
    warped_q = uf.f_to_srsf(warped_f, time)

    # Map gamI onto the time axis before sampling gam there.
    span = time[-1] - time[0]
    query_points = span * gamI + time[0]
    gam_composed = np.interp(query_points, time, gam)

    return warped_f, warped_q, gam_composed