Multi-dimensional Clustering Visualization

Posted by MikeFairbrother on Fri, 22 Oct 2021 14:02:08 +0200

Preface

Some time ago I clustered a set of samples and found that they had many dimensions, which made them hard to present clearly with two-dimensional graphics. This post therefore summarizes some commonly used visualization approaches: reduce the dimensionality of the data and depict the characteristics of each sample in a two-dimensional plot.

Case

Background:
A game company collects behavior data and attributes for each player. After preprocessing, the data set looks like this:

toy_id is the unique identifier of each player, and columns a-l are the processed features that describe the player's attributes.
We use statistical chart analysis plus unsupervised learning to divide the N players into x clusters.
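
The snippets below refer to this table as test_kpro. Here is a minimal made-up stand-in with the same layout (toy_id plus features a-l, where c, e, f, h are categorical and the rest are continuous; all values are invented for illustration):

import numpy as np
import pandas as pd

# Made-up stand-in for the real player table: one row per player,
# columns a-l as features (c, e, f, h categorical; the rest continuous).
rng = np.random.default_rng(0)
n = 500
test_kpro = pd.DataFrame({'toy_id': np.arange(n)})
for col in ['c', 'e', 'f', 'h']:
    test_kpro[col] = rng.choice(['A', 'B', 'C'], size=n)
for col in ['a', 'b', 'd', 'g', 'i', 'j', 'k', 'l']:
    test_kpro[col] = rng.random(n) * 100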

By the way, the code for the DBSCAN and k-prototypes models is shared below to make future clustering easier 😄
*Note: scale continuous variables first to remove the effect of different units (dimensional effects).

k-prototypes:
Handles continuous plus categorical variables (Euclidean distance on the numeric part, matching/Hamming distance on the categorical part).
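
Conceptually, the dissimilarity that k-prototypes minimizes combines squared Euclidean distance on the numeric columns with a mismatch count on the categorical columns, weighted by a parameter gamma. A rough sketch of the idea (not the kmodes library's internals; the gamma value is a placeholder):

import numpy as np

def kprototypes_dissimilarity(x_num, c_num, x_cat, c_cat, gamma=0.5):
    """Rough sketch: squared Euclidean distance on the numeric features
    plus gamma times the number of mismatched categorical features."""
    numeric_part = np.sum((np.asarray(x_num, dtype=float) - np.asarray(c_num, dtype=float)) ** 2)
    categorical_part = np.sum(np.asarray(x_cat) != np.asarray(c_cat))
    return numeric_part + gamma * categorical_part

# Distance between a sample and a cluster prototype (made-up values)
d = kprototypes_dissimilarity([0.2, 0.7], [0.1, 0.9], ['A', 'B'], ['A', 'C'])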

"""
k-prototypes
"""
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
from yellowbrick.cluster import KElbowVisualizer
from yellowbrick.cluster import SilhouetteVisualizer
from yellowbrick.cluster import InterclusterDistance
from yellowbrick.model_selection import LearningCurve
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("notebook")
import numpy as np
import pandas as pd
from kmodes.kprototypes import KPrototypes
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns


"""
Normalization is used to deal with problems with different dimensions. This case is not applicable
"""
def minmax(calc_case):
    min_max_scaler = MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(calc_case)
    X_scaled = pd.DataFrame(x_scaled,columns=calc_case.columns)
    return X_scaled

def data_clean(data, key=None, strlist=None, numlist=None):
    data = data.fillna(0)
    for s in strlist:
        data[s] = data[s].astype(str)
    temp = data[[key]+strlist]
    temp2 = data[numlist]
    clean = pd.concat([temp,minmax(temp2)],axis=1)
    return clean


def elbow_method(X, k=None, categorical=None):
    """Plot the k-prototypes cost for 1..k-1 clusters (elbow method)."""
    X2 = X[X.columns.values[1:]]          # drop the id column
    X_matrix = X2.values
    cost = []
    for num_clusters in range(1, k):
        kproto = KPrototypes(n_clusters=num_clusters, init='Cao')
        kproto.fit_predict(X_matrix, categorical=categorical)
        cost.append([num_clusters, kproto.cost_])
    cost = pd.DataFrame(cost, columns=['num_clusters', 'cost'])
    beauty_plot(cost, 'cost', date='num_clusters', marker='*', name='elbow_method-cost', color=['navy'])

def beauty_plot(para, *value, date=None, marker=None, name=None, color=None):
    """Plot one series from DataFrame `para` against the `date` column."""
    plt.figure(figsize=(16, 9))
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    # Disable scientific notation on the y-axis
    y_formatter = matplotlib.ticker.ScalarFormatter(useOffset=False)
    y_formatter.set_scientific(False)
    plt.gca().yaxis.set_major_formatter(y_formatter)
    temp = para.sort_values(by=date)
    x = temp[date]
    y1 = temp[value[0]]
    plt.xlabel(date)
    plt.ylabel(name)
    plt.title(name)
    plt.xticks(rotation=45)
    plt.plot(x, y1, color=color[0], linestyle="--", marker=marker, linewidth=2.0, label=value[0])
    plt.legend()
    plt.grid(color="k", linestyle=":")
    plt.show()

def run_k_prototypes(X, k=None, categorical=None, key=None, strlist=None, numlist=None):
    X2 = data_clean(X, key=key, strlist=strlist, numlist=numlist)
    X2 = X2[X2.columns.values[1:]]        # drop the id column
    X_matrix = X2.values
    kproto = KPrototypes(n_clusters=k, init='Cao')
    clusters = kproto.fit_predict(X_matrix, categorical=categorical)
    print('====== Centroids ======')
    print(kproto.cluster_centroids_)
    print('====== Cost ======')
    print(kproto.cost_)
    X['cluster'] = clusters
    # Eyeballing the clusters gets tedious when there are many rows or features;
    # a summary metric such as the silhouette coefficient
    # (sklearn.metrics.silhouette_score) can be computed on the numeric part instead.
    return X

if __name__ == "__main__":
    # test_kpro is the player DataFrame described above (toy_id + features a-l)
    test_kpro2 = data_clean(test_kpro, key='toy_id', strlist=['c','e','f','h'], numlist=['a','b','d','g','i','j','k','l'])
    # categorical indices refer to c, e, f, h after the id column is dropped
    elbow_method(test_kpro2, k=20, categorical=[0,1,2,3])
    test_kpro3 = run_k_prototypes(test_kpro, k=10, categorical=[0,1,2,3], key='toy_id', strlist=['c','e','f','h'], numlist=['a','b','d','g','i','j','k','l'])

DBSCAN:
Density-based spatial clustering of applications with noise

"""
DBSCAN
"""
from sklearn import datasets
import numpy as np
import random
import matplotlib.pyplot as plt
import time
import copy
 

def find_neighbor(j, x, eps):
    N = list()
    for i in range(x.shape[0]):
        temp = np.sqrt(np.sum(np.square(x[j]-x[i])))  # Calculate Euclidean distance
        if temp <= eps:
            N.append(i)
    return set(N)
 

def DBSCAN(X, eps, min_Pts):
    k = -1
    neighbor_list = []  # neighborhood of each sample
    omega_list = []     # set of core objects
    gama = set(range(len(X)))              # points not yet visited
    cluster = [-1 for _ in range(len(X))]  # cluster label of each point (-1 = noise)
    for i in range(len(X)):
        neighbor_list.append(find_neighbor(i, X, eps))
        if len(neighbor_list[-1]) >= min_Pts:
            omega_list.append(i)           # i is a core object
    omega_list = set(omega_list)           # convert to a set for easy set operations
    while len(omega_list) > 0:
        gama_old = copy.deepcopy(gama)
        j = random.choice(list(omega_list))  # pick a core object at random
        k = k + 1
        Q = [j]
        gama.remove(j)
        while len(Q) > 0:
            q = Q.pop(0)
            if len(neighbor_list[q]) >= min_Pts:
                delta = neighbor_list[q] & gama   # unvisited neighbours of q
                Q.extend(delta)
                gama = gama - delta
        Ck = gama_old - gama               # all points reached from j form cluster k
        for i in Ck:
            cluster[i] = k
        omega_list = omega_list - Ck
    return cluster

if __name__ == "__main__":
    # X is the (scaled) feature matrix of the samples to cluster;
    # test / test2 are the corresponding DataFrames
    eps = 0.1
    min_Pts = 10
    C = DBSCAN(X, eps, min_Pts)
    test['cluster'] = C
    """
    Silhouette coefficient
    """
    from sklearn import metrics
    score = metrics.silhouette_score(test2.iloc[:, 1:-1], test2.iloc[:, -1])
    print(score)
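
The pure-Python loop above is fine for small samples but slow on large ones; scikit-learn ships an optimized DBSCAN with the same eps / min_samples semantics. A minimal sketch, assuming the same scaled feature matrix X:

from sklearn.cluster import DBSCAN as SkDBSCAN

# eps / min_samples play the same roles as eps / min_Pts above;
# a label of -1 marks noise, just like the hand-written version.
db = SkDBSCAN(eps=0.1, min_samples=10)
labels = db.fit_predict(X)   # X: the scaled feature matrix used above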
	 

Data set after clustering:

Traditional 2D / 3D

When the variables to plot are explicit arrays X, Y (or X, Y, Z), we can simply draw the data as it is.

Adjust the details to your specific needs~

Two-dimensional plot

Example:

def plotshow2(para, *value, date=None, marker=None, name=None, color=None):
    """Plot two series from DataFrame `para` against the `date` column."""
    plt.figure(figsize=(22, 10))
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    # Disable scientific notation on the y-axis
    y_formatter = matplotlib.ticker.ScalarFormatter(useOffset=False)
    y_formatter.set_scientific(False)
    plt.gca().yaxis.set_major_formatter(y_formatter)
    temp = para.sort_values(by=date)
    x = temp[date]
    y1 = temp[value[0]]
    y2 = temp[value[1]]
    plt.xlabel(date)
    plt.ylabel(name)
    plt.title(name)
    plt.xticks(rotation=45)
    plt.plot(x, y1, color=color[0], linestyle="--", marker=marker, linewidth=2.0, label=value[0])
    plt.plot(x, y2, color=color[1], linestyle="--", marker=marker, linewidth=2.0, label=value[1])
    plt.legend()
    plt.grid(color="k", linestyle=":")
    plt.show()
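
A possible call on the clustered data from the k-prototypes run above (the feature names 'a' and 'b' are just placeholders from the a-l set):

# Mean of two features per cluster, plotted with plotshow2
cluster_means = test_kpro3.groupby('cluster')[['a', 'b']].mean().reset_index()
plotshow2(cluster_means, 'a', 'b', date='cluster', marker='o',
          name='mean of a and b per cluster', color=['navy', 'darkorange'])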

Three-dimensional plot

Example:

"""
3D space map
"""
data_int = data.copy()
for item in list(data_int.columns)[1:]:
    data_int[item] = data_int[item].apply(lambda x: round(x))
    
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

data = np.random.randint(0, 255, size=[40, 40, 40])

x, y, z = np.array(data_int['aaaa']), np.array(data_int['bbb']), np.array(data_int['cccc'])
ax = plt.subplot(111, projection='3d')  # Create a 3D drawing project
#  Divide the data points into three parts and draw them with color discrimination
ax.scatter(x[:177], y[:177], z[:177], c='y')  # Draw data points
ax.scatter(x[177:354], y[177:354], z[177:354], c='r')
ax.scatter(x[354:532], y[354:532], z[354:532], c='g')

ax.set_zlabel('aaaa')  # Coordinate axis
ax.set_ylabel('bbb')
ax.set_xlabel('cccc')
plt.show()
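
If a 'cluster' column is already present (as produced by the clustering code above), coloring by the label directly avoids the manual slicing. A small sketch, assuming data_int carries such a column:

fig = plt.figure()
ax = fig.add_subplot(projection='3d')
# Color each point by its cluster label instead of slicing by row ranges
sc = ax.scatter(data_int['aaaa'], data_int['bbb'], data_int['cccc'],
                c=data_int['cluster'], cmap='tab10')
ax.set_xlabel('aaaa'); ax.set_ylabel('bbb'); ax.set_zlabel('cccc')
plt.colorbar(sc, label='cluster')
plt.show()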

N-dimensional plot

In short: set the cluster label aside and reduce the dimensionality of all the remaining features, so we can actually look at them with the naked eye... (poor three-dimensional creatures that we are o(﹏╥)o~~)
Here are some common dimensionality-reduction approaches:

pairplot

pairplot is very handy, especially in machine learning, where it is often used to inspect the distribution of features. You can also pass the clustering result through the hue parameter - very intuitive~!

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# `pair` is a DataFrame of the selected features plus a 'cluster' column
sns.pairplot(pair, hue='cluster', height=2, kind='reg')
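
`pair` above is assumed to be a small slice of the clustered data. One way to build it from the k-prototypes result (feature names are placeholders from the a-l set; plotting all twelve features at once gives a grid that is hard to read):

pair = test_kpro3[['a', 'b', 'd', 'g', 'cluster']]
sns.pairplot(pair, hue='cluster', height=2, kind='reg')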

PCA (principal component analysis)

An orthogonal transformation converts the original features into linearly uncorrelated features, called principal components. PCA can reduce the data to any n dimensions; the special case n = 2 turns multidimensional data into points in a plane, which is exactly what we need for visualizing multidimensional data.

"""
Principal component analysis( PCA)
"""
from sklearn import decomposition

pca = decomposition.PCA(n_components=2)

X = pca.fit_transform(test2.iloc[:,1:-1].values)

pos=pd.DataFrame()
pos['X'] =X[:, 0]
pos['Y'] =X[:, 1]
pos['cluster'] = test2['cluster']

ax = pos[pos['cluster']==-1].plot(kind='scatter', x='X', y='Y', color='blue', label='-1 cluster')
pos[pos['cluster']==0].plot(kind='scatter', x='X', y='Y', color='green', label='0 cluster', ax=ax)
pos[pos['cluster']==1].plot(kind='scatter', x='X', y='Y', color='red', label='1 cluster', ax=ax)
pos[pos['cluster']==2].plot(kind='scatter', x='X', y='Y', color='orange', label='2 cluster', ax=ax)
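
How much of the original variance the two components keep can be checked via explained_variance_ratio_; if the total is small, the 2D picture should be read with caution:

# Fraction of variance captured by each principal component, and their sum
print(pca.explained_variance_ratio_)
print(pca.explained_variance_ratio_.sum())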

Multi-dimensional scaling (MDS)

Multidimensional scaling tries to find a low-dimensional representation that preserves the distances of the original high-dimensional data. In short, MDS models similarity: it uses distances in a geometric (here two-dimensional) space to represent relationships in the high-dimensional space. The data can be similarities between objects, interaction frequencies between molecules, or trade indices between countries. Unlike the previous methods, whose input is the raw data, MDS takes a distance matrix (here built from Euclidean distances) as input. The algorithm is iterative, so max_iter specifies the maximum number of iterations, and its running time is the longest of the methods shown here.

"""
Multidimensional ruler( Multi-dimensional scaling, MDS)
"""
from sklearn import manifold

from sklearn.metrics import euclidean_distances

similarities = euclidean_distances(test2.iloc[:,1:-1].values)
# mds = manifold.MDS(n_components=2, max_iter=3000, eps=1e-9, dissimilarity="precomputed", n_jobs=1)
mds = manifold.MDS(n_components=2, max_iter=3000, eps=1e-9, dissimilarity="euclidean", n_jobs=1)
X = mds.fit(similarities).embedding_

pos = pd.DataFrame(X, columns=['X', 'Y'])
pos['cluster'] = test2['cluster'].values

ax = pos[pos['cluster']==-1].plot(kind='scatter', x='X', y='Y', color='blue', label='-1 cluster')
pos[pos['cluster']==0].plot(kind='scatter', x='X', y='Y', color='green', label='0 cluster', ax=ax)
pos[pos['cluster']==1].plot(kind='scatter', x='X', y='Y', color='red', label='1 cluster', ax=ax)
pos[pos['cluster']==2].plot(kind='scatter', x='X', y='Y', color='orange', label='2 cluster', ax=ax)
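
The fitted stress_ value gives a rough idea of how faithfully the 2D embedding reproduces the original distances (lower is better):

# Final stress of the MDS embedding - a rough goodness-of-fit indicator
print(mds.stress_)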

TSNE (t-distributed Stochastic Neighbor Embedding)

t-SNE (t-distributed stochastic neighbor embedding) is a nonlinear dimensionality-reduction algorithm for exploring high-dimensional data. It finds patterns by identifying clusters of observations based on the similarity of data points across many features, and maps the data down to two or three dimensions that humans can inspect. It is essentially a visualization technique, and it is best used for exploratory data analysis.

"""
TSNE(t-distributed Stochastic Neighbor Embedding
"""
from sklearn.manifold import TSNE

# Embed the features in 2D; random_state makes the layout reproducible
embedded = TSNE(n_components=2, random_state=0).fit_transform(test2.iloc[:, 1:-1].values)

pos = pd.DataFrame(embedded, columns=['X', 'Y'])
pos['cluster'] = test2['cluster'].values

ax = pos[pos['cluster']==-1].plot(kind='scatter', x='X', y='Y', color='blue', label='-1 cluster')
pos[pos['cluster']==0].plot(kind='scatter', x='X', y='Y', color='green', label='0 cluster', ax=ax)
pos[pos['cluster']==1].plot(kind='scatter', x='X', y='Y', color='red', label='1 cluster', ax=ax)
pos[pos['cluster']==2].plot(kind='scatter', x='X', y='Y', color='orange', label='2 cluster', ax=ax)
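
t-SNE output is quite sensitive to the perplexity parameter (roughly the effective number of neighbours each point considers), so it can be worth eyeballing a few values:

# Perplexity roughly controls how many neighbours each point "sees";
# values between 5 and 50 are commonly tried.
for perp in (5, 30, 50):
    emb = TSNE(n_components=2, perplexity=perp, random_state=0).fit_transform(test2.iloc[:, 1:-1].values)
    plt.scatter(emb[:, 0], emb[:, 1], c=test2['cluster'], s=5)
    plt.title('perplexity = {}'.format(perp))
    plt.show()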

Topics: Python Visualization