import pandas as pd import random import matplotlib.pyplot as plt import math import numpy as np # calculates euclidean distance between vectors 1 and 2 which can have any number of dimmensions (as long as they both have same number) def compute_euclidean_distance(vec_1, vec_2): distance = 0 # initialise distance between points as 0 for a, b in zip(vec_1, vec_2): # for each element of vector 1 and 2 distance = distance + ((a - b) ** 2) # difference squared added to total distance distance = math.sqrt(distance) # root of distance gives final answer return distance # creates k points with coordinates in the range of the data set passed def initialise_centroids(dataset, k=2): rangeMin = math.floor(np.amin(dataset)) # finds the lowest value in the data (and rounds down to int) rangeMax = math.ceil(np.amax(dataset)) # finds max val in data (and rounds up to int) centroids = np.array([random.randint(rangeMin, rangeMax), random.randint( rangeMin, rangeMax), random.randint(rangeMin, rangeMax), random.randint(rangeMin, rangeMax)]) # initialise at least one centroid as a 4D vector, with random values for each in the range established above # loop to create and append as many other centroids are needed to match k for i in range(k - 1): centroids = np.vstack([centroids, np.array([random.randint(rangeMin, rangeMax), random.randint( rangeMin, rangeMax), random.randint(rangeMin, rangeMax), random.randint(rangeMin, rangeMax)])])# create centroid as a 4D vector, with random values for each in the range established above and append to matrix return centroids # once points are clustered, centroids need to be calculated as the center of that cluster def reCalcCentroids(dataset, assignments, numOfCentroids): tally = np.zeros(numOfCentroids) # an array that tracks the number of points assigned to each centroid - needed to find mean averages = np.zeros((numOfCentroids, 4)) # create a 4xNumCentroids matrix # iterates through each data point for i in range(dataset.shape[0]): tally[assignments[i]] = tally[assignments[i]] + 1 # increase tally for whichever centroid data point i is assigned to # sums up the total for each of the 4 dimensions - this is later divided by number of points to find the mean averages[assignments[i]][0] = averages[assignments[i]][0] + dataset[i][0] averages[assignments[i]][1] = averages[assignments[i]][1] + dataset[i][1] averages[assignments[i]][2] = averages[assignments[i]][2] + dataset[i][2] averages[assignments[i]][3] = averages[assignments[i]][3] + dataset[i][3] for i in range(averages.shape[0]): if tally[i] != 0: # if the tally is zero then we don't want to perform the division because we'd have to divide by 0 which = problems # divide each dimensions total by the number of points to get the mean averages[i][0] = averages[i][0] / tally[i] averages[i][1] = averages[i][1] / tally[i] averages[i][2] = averages[i][2] / tally[i] averages[i][3] = averages[i][3] / tally[i] return averages # for a given point, returns the index in centroids array of the centroid it is closest to def assignPoint(point, centroids): values = np.array([compute_euclidean_distance(point, centroids[0])]) # create new array values that will store distances to all centroids - calculates distance to centroid 0 for i in range(centroids.shape[0] - 1): # for the remaining centroids values = np.vstack( [values, compute_euclidean_distance(point, centroids[i + 1])]) # calculate the euclidean distance between the point and centroid i(+ 1) and then append that distance to the values array index_min = np.argmin(values) # the index of the centroid that is closest is the index of the lowest number in the values array return index_min # assigns all data points to a centroid and returns those assignments as an array of indexes # e.g. if there are two centroids called 0 and 1 then the array might look something like [0, 1, 1, 0, 0, 1, 1, etc.] def runCycle(dataset, centroids): assignments = np.empty(dataset.shape[0], dtype=int) for i in range(dataset.shape[0]): assignments[i] = assignPoint(dataset[i], centroids) return assignments def kmeans(dataset, k=2): sumDistances = [] # keeps track of the objective and is appended each cycle with the current sum distances from centroids centroids = initialise_centroids(dataset, k) # creates k centroids in space ocupied by the dataset assignmentsOld = np.empty(dataset.shape[0], dtype=int) # initialise array Exit = False # while loop will continue to loop until this flag is true while Exit == False: assignmentsNew = runCycle(dataset, centroids) # runs a cycle and assigns all of the points to a centroid if np.array_equal(assignmentsOld, assignmentsNew): # if no points have been assigned to a different centroid then the clustering is complete and the program can exit Exit = True else: assignmentsOld = assignmentsNew # if not, store the new assignments to check against after the next iteration for item in centroids: # for each centroid if np.count_nonzero(item) == 0: #if the centroid = [0, 0, 0, 0] return kmeans(dataset, k) # re-run the process because there has been an error in the initial assignments centroids = reCalcCentroids(dataset, assignmentsNew, k) # re-calculate the centroids to be the mean of their cluster sumDistances.append(objFunction(dataset, centroids, assignmentsOld)) # append the array with the sum of the distance of all points to their assigned centroid - this is graphed later to show the progress of the clustering cluster_assigned = assignmentsNew # return the current assignments as the final clusters plotLine(sumDistances) # make a plot of the objective function over each iteration return [centroids, cluster_assigned] # works out the sum of the distances from their assigned centroid def objFunction(dataset, centroids, assignments): sumDistance = 0 # initialise total as 0 for i in range(dataset.shape[0]): # for each data point sumDistance += compute_euclidean_distance( dataset[i], centroids[assignments[i]]) # calculate the data points euclidean distance from the centroid it is assigned to - found from the assignments array return sumDistance # plots objective function over each iteration of k means def plotLine(data): figur = plt.figure() # x = range from 0 to number of cycles of k means that were run x = list(range(0, len(data))) # y = the sum distances for each cycle y = data #plot data points plt.plot(x, y, 'o') #plot the line to connect these points plt.plot(x, y) #label axes ax1 = figur.add_subplot() ax1.set_ylabel('Sum Distance to Centroids') ax2 = figur.add_subplot() ax2.set_xlabel('Iteration') # plots scatter graph of data points with each cluster's points appearing in a different colour - centroids appear as black points def plotGraph(dataset, centroids, cluster_assigned, y_Axis): fig = plt.figure() dicts = {0: 'height', 1: 'tail length', 2: 'leg length', 3: 'nose circumference'} # dictionary used for labelling y axis X = dataset[dicts[0]].as_matrix() # extract heights from dataset Y = dataset[dicts[y_Axis]].as_matrix() # extract y axis variable by referencing dicts (could be 'tail length', 'leg length' or 'node circumfrence') # each cluster should have different colour to visual distinguish them so each data point checked to see which cluster it belongs to, then the colour of that cluster is used when plotting point # this can handle up to 6 clusters after which point we run out of colours and any additional will just be yellow for i in range(X.shape[0]): if cluster_assigned[i] == 0: col = 'b' elif cluster_assigned[i] == 1: col = 'g' elif cluster_assigned[i] == 2: col = 'r' elif cluster_assigned[i] == 3: col = 'c' elif cluster_assigned[i] == 4: col = 'm' else: col = 'y' plt.scatter(X[i], Y[i], color=col) # plot point i with colour of its assigned cluster #plots centroids in black for i in range(centroids.shape[0]): plt.scatter(centroids[i][0], centroids[i][y_Axis], color='black') # label axes with height on x axis and y axis var found from dictionary lookup ax1 = fig.add_subplot() ax1.set_ylabel(dicts[y_Axis]) ax2 = fig.add_subplot() ax2.set_xlabel(dicts[0]) plt.show() # actual code runs from here: # imports the data from the csv file dataset = pd.read_csv('Task2Data.csv') dataMatrix = dataset.to_numpy() # gets a value of k from the user any value of k works but if k > 6 not enough unique colours to plot choice = int(input('Enter value for k\n')) ans = kmeans(dataMatrix, choice) # returns array where 0th item is centroids of clusters and 1st item is array of assignments for each point to a cluster # gets users choice for vaiable to be plotted on Y axis, height is always on x axis - choice is an int which is passed to plotGraph where dict lookup gets text string choice = int(input('Enter number choice for data on Y-Axis:\n 1. Tail Length\n 2.Leg Length\n 3.Nose Circumference\n')) plotGraph(dataset, ans[0], ans[1], choice) # plot graph of clusters