Wizard Book


Programming Python, Web, Android

Python: K-Medoids Clustering

Proses Data Mining Clustering Metode K-Medoids menggunakan Python

import numpy as np
import random

#source: https://github.com/letiantian/kmedoids
#Bauckhage C. Numpy/scipy Recipes for Data Science: k-Medoids Clustering[R]. Technical Report, University of Bonn, 2015.

def kMedoids(D, k, tmax=100):
    """Cluster points into k groups using the k-medoids algorithm.

    Parameters
    ----------
    D : ndarray of shape (n, n)
        Symmetric pairwise-distance matrix between the n data points.
    k : int
        Number of clusters (medoids) to find.
    tmax : int, optional
        Maximum number of update iterations (default 100).

    Returns
    -------
    M : ndarray of shape (k,)
        Sorted indices of the chosen medoid points.
    C : dict[int, ndarray]
        Mapping from cluster label (0..k-1) to the indices of the points
        assigned to that cluster.

    Raises
    ------
    Exception
        If k exceeds the number of distinct (non-coincident) points.
    """
    # determine dimensions of distance matrix D
    n = D.shape[1]

    if k > n:
        raise Exception('too many medoids')

    # find a set of valid initial cluster medoid indices since we
    # can't seed different clusters with two points at the same location
    valid_medoid_inds = set(range(n))
    invalid_medoid_inds = set([])
    rs, cs = np.where(D == 0)
    # the rows/cols must be shuffled because we keep the first duplicate below
    index_shuf = list(range(len(rs)))
    np.random.shuffle(index_shuf)
    rs = rs[index_shuf]
    cs = cs[index_shuf]
    for r, c in zip(rs, cs):
        # if there are two points with a distance of 0,
        # keep the first one for cluster init
        if r < c and r not in invalid_medoid_inds:
            invalid_medoid_inds.add(c)
    valid_medoid_inds = list(valid_medoid_inds - invalid_medoid_inds)

    if k > len(valid_medoid_inds):
        raise Exception('too many medoids (after removing {} duplicate points)'.format(
            len(invalid_medoid_inds)))

    # randomly initialize an array of k medoid indices
    M = np.array(valid_medoid_inds)
    np.random.shuffle(M)
    M = np.sort(M[:k])

    # create a copy of the array of medoid indices
    Mnew = np.copy(M)

    # initialize a dictionary to represent clusters
    C = {}
    for t in range(tmax):
        # determine clusters, i.e. arrays of data indices
        J = np.argmin(D[:, M], axis=1)
        for kappa in range(k):
            C[kappa] = np.where(J == kappa)[0]
        # update cluster medoids: choose the member with the smallest
        # mean distance to all other members of its cluster
        for kappa in range(k):
            J = np.mean(D[np.ix_(C[kappa], C[kappa])], axis=1)
            j = np.argmin(J)
            Mnew[kappa] = C[kappa][j]
        # BUG FIX: the original called np.sort(Mnew) and discarded the
        # returned sorted copy, leaving Mnew unsorted while M was sorted;
        # that can make the convergence check below fail spuriously.
        # Sort in place so the element-wise comparison is meaningful.
        Mnew.sort()
        # check for convergence
        if np.array_equal(M, Mnew):
            break
        M = np.copy(Mnew)
    else:
        # tmax reached without convergence: final update of memberships
        J = np.argmin(D[:, M], axis=1)
        for kappa in range(k):
            C[kappa] = np.where(J == kappa)[0]

    # return results
    return M, C

from sklearn.metrics.pairwise import pairwise_distances
import numpy as np

# Demo: cluster ten loan-record rows into two groups with k-medoids.
data = np.array([[2018, 3, 3, 10000000, 10, 1000000, 150000, 3000000, 7000000, 58],
                 [2018, 8, 8, 50000000, 24, 2083500, 750000, 5000400, 4000000, 61],
                 [2019, 3, 8, 15000000, 15, 1000000, 225000, 3000000, 12000000, 58],
                 [2019, 1, 9, 3500000, 20, 175000, 52500, 3325000, 175000, 58],
                 [2018, 7, 7, 35000000, 36, 972500, 525000, 12642500, 22357500, 59],
                 [2017, 6, 4, 5000000, 10, 500000, 75000, 1000000, 4000000, 60],
                 [2020, 8, 9, 10000000, 10, 1000000, 150000, 9000000, 1000000, 57],
                 [2018, 4, 4, 8000000, 24, 333500, 120000, 5336000, 2664000, 59],
                 [2019, 7, 3, 40000000, 20, 2000000, 600000, 2000000, 38000000, 58],
                 [2021, 12, 10, 2000000, 10, 200000, 30000, 1600000, 400000, 56]])

# Euclidean distance between every pair of records
D = pairwise_distances(data, metric='euclidean')

# partition the records into 2 clusters
M, C = kMedoids(D, 2)

# show the representative (medoid) record of each cluster
print('medoids:')
for medoid_idx in M:
    print(data[medoid_idx])

# show every record together with its cluster label
print('')
print('clustering result:')
for cluster_id, member_ids in C.items():
    for member in member_ids:
        print('label {0} : {1}'.format(cluster_id, data[member]))

email : contohprogram.com@gmail.com
WA : +6289671400363