1. KMeans Clustering [30 pts]
KMeans is trying to solve the following optimization problem:
$\underset{S}{\operatorname{argmin}} \sum_{i=1}^{K} \sum_{x_j \in S_i} \|x_j - \mu_i\|^2$
where one needs to partition the $N$ observations into $K$ clusters $S = \{S_1, S_2, \ldots, S_K\}$, and each cluster $S_i$ has $\mu_i$ as its center.
1.1 pairwise distance
In this section, you are asked to implement pairwise_dist function.
Given $X \in \mathbb{R}^{N \times D}$ and $Y \in \mathbb{R}^{M \times D}$, obtain the pairwise distance matrix $dist \in \mathbb{R}^{N \times M}$ using the Euclidean distance metric, where $dist_{i,j} = \|X_i - Y_j\|_2$.
DO NOT USE FOR LOOPS in your implementation -- they are slow and will make your code too slow to pass our grader. Use array broadcasting instead.
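One common no-loop formulation (not necessarily the one used in the implementation below, which relies on np.linalg.norm with broadcasting) expands the squared distance as $\|x_i - y_j\|^2 = \|x_i\|^2 + \|y_j\|^2 - 2\,x_i \cdot y_j$; pairwise_dist_sketch is a hypothetical name used only for illustration:
import numpy as np
def pairwise_dist_sketch(x, y):
    # ||x_i - y_j||^2 = ||x_i||^2 + ||y_j||^2 - 2 x_i . y_j, computed with broadcasting only
    sq_dist = np.sum(x ** 2, axis=1)[:, None] + np.sum(y ** 2, axis=1)[None, :] - 2 * x @ y.T
    return np.sqrt(np.maximum(sq_dist, 0.0))  # clip tiny negatives caused by floating-point round-off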
1.2 KMeans Implementation
In this section, you are asked to implement the _init_centers, _update_assignment, _update_centers, and _get_loss functions.
For the function signature, please see the corresponding doc strings.
1.3 Find the optimal number of clusters
In this section, you are asked to implement find_optimal_num_clusters function.
You will now use the elbow method to find the optimal number of clusters.
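As a rough sketch of how the elbow can be visualized (assuming the KMeans class implemented below; plot_elbow is a hypothetical helper, not part of the assignment API):
import numpy as np
import matplotlib.pyplot as plt
def plot_elbow(data, max_K=15):
    # run KMeans for K = 1..max_K and plot the final loss against K; the "elbow" is where
    # adding more clusters stops reducing the loss substantially
    losses = KMeans().find_optimal_num_clusters(data, max_K)
    plt.plot(np.arange(1, max_K + 1), losses, marker='o')
    plt.xlabel('number of clusters K')
    plt.ylabel('KMeans loss')
    plt.show()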
1.4 Autograder test to find centers for data points
To obtain these points, you need to pass the tests set up in the autograder. These will test the centers created by your implementation. Be sure to upload the correct files to obtain these points.
In [2]:
class KMeans(object):
def __init__(self): #No need to implement
pass
def pairwise_dist(self, x, y): # [5 pts]
"""
Args:
x: N x D numpy array
y: M x D numpy array
Return:
dist: N x M array, where dist[i, j] is the euclidean distance between
x[i, :] and y[j, :]
"""
#l2 norm, ord=2
dist = np.linalg.norm(x[:, np.newaxis, :] - y, ord=2, axis=2)
return dist
# raise NotImplementedError
def _init_centers(self, points, K, **kwargs): #
"""
Args:
points: NxD numpy array, where N is # points and D is the dimensionality
K: number of clusters
kwargs: any additional arguments you want
Return:
centers: K x D numpy array, the centers.
"""
#K random indices from points
indices = np.random.choice(points.shape[0], size=K, replace=False)
#form the numpy array with these indices
centers = points[indices,:]
return centers
# raise NotImplementedError
def _update_assignment(self, centers, points): #
"""
Args:
centers: KxD numpy array, where K is the number of clusters, and D is the dimension
points: NxD numpy array, the observations
Return:
cluster_idx: numpy array of length N, the cluster assignment for each point
Hint: You could call pairwise_dist() function.
"""
#distances between points and all cluster centers
distances = self.pairwise_dist(points,centers)
#index of minimum distance for each row
cluster_idx = np.argmin(distances,axis=1)
return cluster_idx
# raise NotImplementedError
def _update_centers(self, old_centers, cluster_idx, points): #
"""
Args:
old_centers: old centers KxD numpy array, where K is the number of clusters, and D is the dimension
cluster_idx: numpy array of length N, the cluster assignment for each point
points: NxD numpy array, the observations
Return:
centers: new centers, K x D numpy array, where K is the number of clusters, and D is the dimension.
"""
K,D = old_centers.shape[0], old_centers.shape[1]
#intialize centers as zero array
centers = np.zeros((K,D))
for i in range(K):
#find mean of all points having i as cluster idx
centers[i] = np.mean(points[cluster_idx==i, :], axis=0)
return centers
# raise NotImplementedError
def _get_loss(self, centers, cluster_idx, points): #
"""
Args:
centers: KxD numpy array, where K is the number of clusters, and D is the dimension
cluster_idx: numpy array of length N, the cluster assignment for each point
points: NxD numpy array, the observations
Return:
loss: a single float number, which is the objective function of KMeans.
"""
#find squared distances between all points and cluster centers
distances = np.linalg.norm(points[:, np.newaxis, :] - centers, ord=2, axis=2) ** 2
#select distance from cluster center
distance_from_cluster_center = distances[np.arange(len(distances)), cluster_idx]
#loss is sum of all these distances
loss = np.sum(distance_from_cluster_center)
return loss
# raise NotImplementedError
def __call__(self, points, K, max_iters=100, abs_tol=1e-16, rel_tol=1e-16, verbose=False, **kwargs):
"""
Args:
points: NxD numpy array, where N is # points and D is the dimensionality
K: number of clusters
max_iters: maximum number of iterations (Hint: You could change it when debugging)
abs_tol: convergence criteria w.r.t absolute change of loss
rel_tol: convergence criteria w.r.t relative change of loss
verbose: boolean to set whether method should print loss (Hint: helpful for debugging)
kwargs: any additional arguments you want
Return:
cluster assignments: Nx1 int numpy array
cluster centers: K x D numpy array, the centers
loss: final loss value of the objective function of KMeans
"""
centers = self._init_centers(points, K, **kwargs)
for it in range(max_iters):
cluster_idx = self._update_assignment(centers, points)
centers = self._update_centers(centers, cluster_idx, points)
loss = self._get_loss(centers, cluster_idx, points)
K = centers.shape[0]
if it:
diff = np.abs(prev_loss - loss)
if diff < abs_tol and diff / prev_loss < rel_tol:
break
prev_loss = loss
if verbose:
print('iter %d, loss: %.4f' % (it, loss))
return cluster_idx, centers, loss
def find_optimal_num_clusters(self, data, max_K=15): #
"""Plots loss values for different number of clusters in K-Means
Args:
data: NxD numpy array, the observations (e.g. flattened image pixels)
max_K: maximum number of clusters to test
Return:
losses: an array of loss denoting the loss of each number of clusters
"""
losses = []
#for each value of K, find the loss and append to array
for i in range(1,max_K+1):
cluster_idx, centers, loss = self.__call__(data, i)
losses.append(loss)
return losses
# raise NotImplementedError
In [3]:
# Helper function for checking the implementation of the pairwise_dist function. Please DO NOT change this function
# TEST CASE
x = np.random.randn(2, 2)
y = np.random.randn(3, 2)
print("*** Expected Answer ***")
print("""==x==
[[ 1.62434536 -0.61175641]
[-0.52817175 -1.07296862]]
==y==
[[ 0.86540763 -2.3015387 ]
[ 1.74481176 -0.7612069 ]
[ 0.3190391 -0.24937038]]
==dist==
[[1.85239052 0.19195729 1.35467638]
[1.85780729 2.29426447 1.18155842]]""")
print("\n*** My Answer ***")
print("==x==")
print(x)
print("==y==")
print(y)
print("==dist==")
print(KMeans().pairwise_dist(x, y))
*** Expected Answer ***
==x==
[[ 1.62434536 -0.61175641]
[-0.52817175 -1.07296862]]
==y==
[[ 0.86540763 -2.3015387 ]
[ 1.74481176 -0.7612069 ]
[ 0.3190391 -0.24937038]]
==dist==
[[1.85239052 0.19195729 1.35467638]
[1.85780729 2.29426447 1.18155842]]
*** My Answer ***
==x==
[[ 1.62434536 -0.61175641]
[-0.52817175 -1.07296862]]
==y==
[[ 0.86540763 -2.3015387 ]
[ 1.74481176 -0.7612069 ]
[ 0.3190391 -0.24937038]]
==dist==
[[1.85239052 0.19195729 1.35467638]
[1.85780729 2.29426447 1.18155842]]
In [4]:
def image_to_matrix(image_file, grays=False):
"""
Convert .png image to matrix
of values.
params:
image_file = str
grays = Boolean
returns:
img = (color) np.ndarray[np.ndarray[np.ndarray[float]]]
or (grayscale) np.ndarray[np.ndarray[float]]
"""
img = plt.imread(image_file)
# in case of transparency values
if len(img.shape) == 3 and img.shape[2] > 3:
height, width, depth = img.shape
new_img = np.zeros([height, width, 3])
for r in range(height):
for c in range(width):
new_img[r, c, :] = img[r, c, 0:3]
img = np.copy(new_img)
if grays and len(img.shape) == 3:
height, width = img.shape[0:2]
new_img = np.zeros([height, width])
for r in range(height):
for c in range(width):
new_img[r, c] = img[r, c, 0]
img = new_img
return img
In [5]:
image_values = image_to_matrix('./images/bird_color_24.png')
r = image_values.shape[0]
c = image_values.shape[1]
ch = image_values.shape[2]
# flatten the image_values
image_values = image_values.reshape(r*c,ch)
k = 6 # feel free to change this value
cluster_idx, centers, loss = KMeans()(image_values, k)
updated_image_values = np.copy(image_values)
# assign each pixel to cluster mean
for i in range(0,k):
indices_current_cluster = np.where(cluster_idx == i)[0]
updated_image_values[indices_current_cluster] = centers[i]
updated_image_values = updated_image_values.reshape(r,c,ch)
plt.figure(None,figsize=(9,12))
plt.imshow(updated_image_values)
plt.show()
In [6]:
KMeans().find_optimal_num_clusters(image_values)
Out[6]:
[25575.970952916756,
14136.693323245287,
10225.60985715669,
6993.510281152104,
6388.54308345205,
4749.894457724676,
3383.379286682485,
3418.0366633949584,
2738.2662250749277,
2554.129469081243,
2065.325028335666,
1929.4098314621665,
1965.2534688501296,
1706.074071535149,
1580.6903867544215]
Silhouette Coefficient Evaluation
The average silhouette of the data is another useful criterion for assessing the natural number of clusters. The silhouette of a data instance is a measure of how closely it is matched to data within its cluster and how loosely it is matched to data of the neighbouring cluster.
The silhouette value is a measure of how similar an object is to its own cluster (cohesion) compared to other clusters (separation). The silhouette ranges from −1 to +1, where a high value indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. If most objects have a high value, then the clustering configuration is appropriate. If many points have a low or negative value, then the clustering configuration may have too many or too few clusters.
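Concretely, if $a(i)$ is the mean distance from point $i$ to the other points in its own cluster and $b(i)$ is the mean distance from point $i$ to the points of the nearest neighboring cluster, the per-point silhouette averaged in the code below is:
$s(i) = \frac{b(i) - a(i)}{\max\{a(i),\, b(i)\}}, \qquad \text{Silhouette Coefficient} = \frac{1}{N}\sum_{i=1}^{N} s(i)$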
In [7]:
def intra_cluster_dist(cluster_idx, data, labels): # [4 pts]
"""
Calculates the average distance from a point to other points within the same cluster
Args:
cluster_idx: the cluster index (label) for which we want to find the intra cluster distance
data: NxD numpy array, where N is # points and D is the dimensionality
labels: 1D array of length N where each entry indicates the cluster assignment for that point
Return:
intra_dist_cluster: 1D array where the i_th entry denotes the average distance from point i
in cluster denoted by cluster_idx to other points within the same cluster
"""
# indices of rows with cluster==cluster_idx
indices = [i for i in range(len(labels)) if labels[i]==cluster_idx]
# find rows
rows_of_interest = data[indices]
# initialize intra_dist_cluster
intra_dist_cluster = np.zeros((rows_of_interest.shape[0]))
for i in range(intra_dist_cluster.shape[0]):
x = np.concatenate((rows_of_interest[:i,:], rows_of_interest[i+1:,:]))
y = rows_of_interest[i, :]
intra_dist_cluster[i] = np.mean(np.sqrt(np.sum((x - y[None,:])**2, axis = 1)))
return intra_dist_cluster
# raise NotImplementedError
def inter_cluster_dist(cluster_idx, data, labels): #
"""
Calculates the average distance from one cluster to the nearest cluster
Args:
cluster_idx: the cluster index (label) for which we want to find the intra cluster distance
data: NxD numpy array, where N is # points and D is the dimensionality
labels: 1D array of length N where each entry indicates the cluster assignment for that point
Return:
inter_dist_cluster: 1D array where the i-th entry denotes the average distance from point i in cluster
denoted by cluster_idx to the nearest neighboring cluster
"""
# find number of distinct clusters
num_clusters = len(np.unique(labels))
# separate data points by cluster
separate_data = []
for j in range(num_clusters):
indices = [i for i in range(len(labels)) if labels[i] == j]
separate_data.append(data[indices])
# indices of rows with cluster==cluster_idx
indices = [i for i in range(len(labels)) if labels[i] == cluster_idx]
# find rows
rows_of_interest = data[indices]
# initialize inter_dist_cluster
inter_dist_cluster = np.zeros((rows_of_interest.shape[0]))
# find min. average distance from each point to nearest cluster
for i in range(rows_of_interest.shape[0]):
y = rows_of_interest[i, :]
distances = []
for j in range(len(separate_data)):
if j != cluster_idx:
x = separate_data[j]
distances.append(np.mean(np.sqrt(np.sum((x - y[None, :]) ** 2, axis=1))))
inter_dist_cluster[i] = min(distances)
return inter_dist_cluster
# raise NotImplementedError
def silhouette_coefficient(data, labels): #
"""
Finds the silhouette coefficient of the current cluster assignment
Args:
data: NxD numpy array, where N is # points and D is the dimensionality
labels: 1D array of length N where each entry indicates the cluster assignment for that point
Return:
silhouette_coefficient: Silhouette coefficient of the current cluster assignment
"""
#unique clusters
clusters = np.unique(labels)
#initialize silhouette coefficient
SC = 0
#for each value of cluster_idx, find inter and intra cluster distance, add SC(i) to SC
for cluster_idx in clusters:
intra_dist_cluster = intra_cluster_dist(cluster_idx, data, labels)
inter_dist_cluster = inter_cluster_dist(cluster_idx, data, labels)
#max of inter and intra cluster distance for each point
max_of_both = np.max(np.array([intra_dist_cluster, inter_dist_cluster]), axis=0)
#calculate SC of each point
S = [x/y for x, y in zip(inter_dist_cluster - intra_dist_cluster, max_of_both)]
#add sum of all SCs for that cluster to SC
SC+= np.sum(S)
#SC is average of sum of all individual SCs
SC/= data.shape[0]
return SC
# raise NotImplementedError
In [8]:
def plot_silhouette_coefficient(data, max_K=15):
"""
Plot silhouette coefficient for different number of clusters, no need to implement
"""
clusters = np.arange(2, max_K+1)
silhouette_coefficients = []
for k in range(2, max_K+1):
labels, _, _ = KMeans()(data, k)
silhouette_coefficients.append(silhouette_coefficient(data, labels))
plt.plot(clusters, silhouette_coefficients)
return silhouette_coefficients
data = np.random.rand(200,3) * 100
plot_silhouette_coefficient(data)
Out[8]:
[0.27434894586234465,
0.23974317309219537,
0.25170126787277375,
0.29110188891166133,
0.32392134092333413,
0.2978439345698108,
0.24118691265277048,
0.2861673492358512,
0.27509945216139964,
0.2710429481302329,
0.27049817823622974,
0.28742447415210476,
0.28737886551571834,
0.2721522663631721]
Limitation of K-Means
One of the limitations of K-Means Clustering is that it depends heavily on the shape of the dataset. A common example of this is trying to cluster one circle within another (concentric circles). A K-means classifier will fail to do this and will end up effectively drawing a line which crosses the circles. You can visualize this limitation in the cell below.
In [9]:
# visualize limitation of kmeans, do not have to implement
from sklearn.datasets import make_circles, make_moons
X1, y1 = make_circles(factor=0.5, noise=0.05, n_samples=1500)
X2, y2 = make_moons(noise=0.05, n_samples=1500)
def visualise(X, C, K):# Visualization of clustering. You don't need to change this function
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=C,cmap='rainbow')
plt.title('Visualization of K = '+str(K), fontsize=15)
plt.show()
pass
cluster_idx1, centers1, loss1 = KMeans()(X1, 2)
visualise(X1, cluster_idx1, 2)
cluster_idx2, centers2, loss2 = KMeans()(X2, 2)
visualise(X2, cluster_idx2, 2)
2. EM algorithm
2.1 Performing EM Update
A univariate Gaussian Mixture Model (GMM) has two components, both of which have their own mean and standard deviation. The model is defined by the following parameters:
$z \sim \mathrm{Bernoulli}(\theta)$
$p(x|z=0) \sim \mathcal{N}(\mu, \sigma)$
$p(x|z=1) \sim \mathcal{N}(2\mu, 3\sigma)$
For a dataset of N datapoints, find the following:
2.1.1. Write the marginal probability of x, i.e. $p(x)$
2.1.2. E-Step: Compute the posterior probability, i.e. $p(z_i=k|x_i)$, where $k = \{0, 1\}$
2.1.3. M-Step: Compute the updated value of $\mu$ (you can keep $\sigma$ fixed for this) [3pts]
2.1.4. M-Step: Compute the updated value of $\sigma$ (you can keep $\mu$ fixed for this) [3pts]
2.1.1 Marginal probability of x, i.e. $p(x)$
Given: A univariate Gaussian mixture model with two components, defined by their parameters as:
$z \sim \mathrm{Bernoulli}(\theta)$
$p(x|z=0) \sim \mathcal{N}(\mu, \sigma)$
$p(x|z=1) \sim \mathcal{N}(2\mu, 3\sigma)$
Note: Considering $p(z=0) = \theta$, and treating $\sigma$ and $3\sigma$ as standard deviations as suggested by the TAs on Piazza:
Marginal probability of x can be calculated as:
$p(x) = \sum_z p(z)\,p(x|z) = p(z=0)\,p(x|z=0) + p(z=1)\,p(x|z=1) = \theta\,\mathcal{N}(\mu, \sigma) + (1-\theta)\,\mathcal{N}(2\mu, 3\sigma)$
2.1.2 E-Step: Posterior probability, i.e. $p(z_i=k|x_i)$, where $k = \{0, 1\}$
For k=0, we get:
$p(z_i=0|x_i) = \frac{p(z_i=0)\,p(x_i|z_i=0)}{p(x_i)} = \frac{\theta\,\mathcal{N}(\mu,\sigma)}{\theta\,\mathcal{N}(\mu,\sigma) + (1-\theta)\,\mathcal{N}(2\mu,3\sigma)}$
Similarly, for k=1, we get:
$p(z_i=1|x_i) = \frac{p(z_i=1)\,p(x_i|z_i=1)}{p(x_i)} = \frac{(1-\theta)\,\mathcal{N}(2\mu,3\sigma)}{\theta\,\mathcal{N}(\mu,\sigma) + (1-\theta)\,\mathcal{N}(2\mu,3\sigma)}$
2.1.3 M-Step: Computing the updated value of $\mu$
To find the updated value of $\mu$, we need to solve the optimization problem:
$\theta^{new} = \underset{\theta}{\operatorname{argmax}} \sum_z p(Z|X, \theta^{old})\,\ln p(X, Z|\theta)$
To solve this optimization problem, we will find its derivative and equate it to 0.
Representing $p(Z=1|X, \theta^{old})$ as $\tau(z=1)$ and $p(Z=0|X, \theta^{old})$ as $\tau(z=0)$, and using the derivative of the log of the Gaussian pdf w.r.t. $\mu$, we get:
$\frac{\partial}{\partial \mu} \sum_z p(Z|X,\theta^{old})\,\ln p(X,Z|\theta) = \frac{\partial}{\partial \mu}\left[\tau(z=1)\ln\big((1-\theta)\,\mathcal{N}(2\mu,3\sigma)\big) + \tau(z=0)\ln\big(\theta\,\mathcal{N}(\mu,\sigma)\big)\right] = \tau(z=1)\,\frac{\partial \ln \mathcal{N}(2\mu,3\sigma)}{\partial \mu} + \tau(z=0)\,\frac{\partial \ln \mathcal{N}(\mu,\sigma)}{\partial \mu} = \tau(z=1)\,\frac{2(x-2\mu)}{9\sigma^2} + \tau(z=0)\,\frac{x-\mu}{\sigma^2}$
Equating the above value of derivative to 0, we get:
$\tau(z=1)\,\frac{2(x-2\mu)}{9\sigma^2} + \tau(z=0)\,\frac{x-\mu}{\sigma^2} = 0$
$2x\,\tau(z=1) - 4\mu\,\tau(z=1) + 9x\,\tau(z=0) - 9\mu\,\tau(z=0) = 0$
$x\,\big(2\tau(z=1) + 9\tau(z=0)\big) = \mu\,\big(9\tau(z=0) + 4\tau(z=1)\big)$
This gives us the updated value of $\mu$ as:
$\mu = \frac{x\,\big(2\tau(z=1) + 9\tau(z=0)\big)}{9\tau(z=0) + 4\tau(z=1)}$
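As a quick sanity check of this update (a hypothetical single-point example with assumed values for $x$, $\sigma$ and the responsibilities), plugging the resulting $\mu$ back into the derivative should give zero:
import numpy as np
# assumed values for one datapoint x, a fixed sigma, and responsibilities tau0 = tau(z=0), tau1 = tau(z=1)
x, sigma, tau0, tau1 = 2.0, 1.5, 0.7, 0.3
mu = x * (2 * tau1 + 9 * tau0) / (9 * tau0 + 4 * tau1)  # updated mu from the formula above
grad = tau1 * 2 * (x - 2 * mu) / (9 * sigma ** 2) + tau0 * (x - mu) / sigma ** 2
print(np.isclose(grad, 0.0))  # True: the update makes the derivative vanish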
2.1.4 M-Step: Computing the updated value of $\sigma$
For this part, we solve the same optimization problem, but this time we take the partial derivative w.r.t. $\sigma$.
Representing $p(Z=1|X, \theta^{old})$ as $\tau(z=1)$ and $p(Z=0|X, \theta^{old})$ as $\tau(z=0)$, and using the derivative of the log of the Gaussian pdf w.r.t. $\sigma$, we get:
$\frac{\partial}{\partial \sigma} \sum_z p(Z|X,\theta^{old})\,\ln p(X,Z|\theta) = \frac{\partial}{\partial \sigma}\left[\tau(z=1)\ln\big((1-\theta)\,\mathcal{N}(2\mu,3\sigma)\big) + \tau(z=0)\ln\big(\theta\,\mathcal{N}(\mu,\sigma)\big)\right] = \tau(z=1)\,\frac{\partial \ln \mathcal{N}(2\mu,3\sigma)}{\partial \sigma} + \tau(z=0)\,\frac{\partial \ln \mathcal{N}(\mu,\sigma)}{\partial \sigma} = \tau(z=1)\,\frac{(x-2\mu)^2 - 9\sigma^2}{9\sigma^3} + \tau(z=0)\,\frac{(x-\mu)^2 - \sigma^2}{\sigma^3}$
Equating the above value of derivative to 0, we get:
$\tau(z=1)\,\frac{(x-2\mu)^2 - 9\sigma^2}{9\sigma^3} + \tau(z=0)\,\frac{(x-\mu)^2 - \sigma^2}{\sigma^3} = 0$
$\tau(z=1)\big((x-2\mu)^2 - 9\sigma^2\big) + 9\tau(z=0)\big((x-\mu)^2 - \sigma^2\big) = 0$
$\tau(z=1)(x-2\mu)^2 - 9\tau(z=1)\sigma^2 + 9\tau(z=0)(x-\mu)^2 - 9\tau(z=0)\sigma^2 = 0$
$\sigma^2 = \frac{\tau(z=1)(x-2\mu)^2 + 9\tau(z=0)(x-\mu)^2}{9\big(\tau(z=1) + \tau(z=0)\big)}$
This gives us the updated value of $\sigma$ as:
$\sigma = \frac{1}{3}\sqrt{\frac{\tau(z=1)(x-2\mu)^2 + 9\tau(z=0)(x-\mu)^2}{\tau(z=1) + \tau(z=0)}}$
2.2 EM Algorithm in ABO Blood Groups
In the ABO blood group system, each individual has a phenotype and a genotype as shown below. The genotype is made of underlying alleles (A, B, O).
Phenotype   Genotype
A           AA, AO, OA
B           BB, BO, OB
O           OO
AB          AB
In a research experiment, scientists wanted to model the distribution of the genotypes of the population. They collected the phenotype information from the participants, as this could be directly observed from an individual's blood group. The scientists, however, want to use this data to model the underlying genotype information. In order to help them obtain an understanding, you suggest using the EM algorithm to find out the genotype distribution.
You know that the probability that an allele is present in an individual is independent of the probability of any other allele, i.e. $P(AO) = P(OA) = P(A) \cdot P(O)$ and so on. Also note that the genotype pairs (AO, OA) and (BO, OB) are identical and can be treated as AO and BO respectively. You also know that the alleles follow a multinomial distribution.
$p(O) = 1 - p(A) - p(B)$
Let $n_A, n_B, n_O, n_{AB}$ be the number of individuals with phenotypes A, B, O and AB respectively. Let $n_{AA}, n_{AO}, n_{BB}, n_{BO}, n_{AB}$ be the number of individuals with genotypes AA, AO, BB, BO and AB respectively. These satisfy the following conditions:
$n_A = n_{AA} + n_{AO}$
$n_B = n_{BB} + n_{BO}$
$n_A + n_B + n_O + n_{AB} = n$
Given:
$p_A = p_B = p_O = \frac{1}{3}$
$n_A = 186,\; n_B = 38,\; n_O = 284,\; n_{AB} = 13$
2.2.1. In the E-step, compute the values of $n_{AA}, n_{AO}, n_{BB}, n_{BO}$. [5pts]
2.2.2. In the M-step, find the new values of $p_A, p_B$ given the updated values from the E-step above. (Round off the answer to 3 decimal places) [5pts]
2.2.1 We have:
$p_{AA} = p_A^2 = \left(\frac{1}{3}\right)^2 = \frac{1}{9}$
$p_{AO} = 2 p_A p_O = 2\left(\frac{1}{3}\right)^2 = \frac{2}{9}$
$p_{BB} = p_B^2 = \left(\frac{1}{3}\right)^2 = \frac{1}{9}$
$p_{BO} = 2 p_B p_O = 2\left(\frac{1}{3}\right)^2 = \frac{2}{9}$
$p_{OO} = p_O^2 = \left(\frac{1}{3}\right)^2 = \frac{1}{9}$
$p_{AB} = 2 p_A p_B = 2\left(\frac{1}{3}\right)^2 = \frac{2}{9}$
We can calculate:
$n_{AA} = n_A \cdot \frac{p_{AA}}{p_{AA} + p_{AO}} = 186 \cdot \frac{1/9}{1/9 + 2/9} = \frac{186}{3} = 62$
$n_{AO} = n_A \cdot \frac{p_{AO}}{p_{AA} + p_{AO}} = 186 \cdot \frac{2/9}{1/9 + 2/9} = 186 \cdot \frac{2}{3} = 124$
$n_{BB} = n_B \cdot \frac{p_{BB}}{p_{BB} + p_{BO}} = 38 \cdot \frac{1/9}{1/9 + 2/9} = \frac{38}{3} \approx 13$
$n_{BO} = n_B \cdot \frac{p_{BO}}{p_{BB} + p_{BO}} = 38 \cdot \frac{2/9}{1/9 + 2/9} = 38 \cdot \frac{2}{3} \approx 25$
2.2.2 For the M-step, we will solve an optimization problem, given the constraint $p_A + p_B + p_O = 1$.
We have the log-likelihood function:
$LL(p_A, p_B, p_O) = \log\left(p_{AA}^{n_{AA}}\, p_{AO}^{n_{AO}}\, p_{BB}^{n_{BB}}\, p_{BO}^{n_{BO}}\, p_{OO}^{n_{OO}}\, p_{AB}^{n_{AB}}\right) = n_{AA}\log p_{AA} + n_{AO}\log p_{AO} + n_{BB}\log p_{BB} + n_{BO}\log p_{BO} + n_{OO}\log p_{OO} + n_{AB}\log p_{AB}$
Thus, the Lagrangian would be:
$L(p_A, p_B, p_O) = LL(p_A, p_B, p_O) - \lambda(p_A + p_B + p_O - 1)$
To solve the optimization problem, we find the partial derivatives and set them equal to 0.
$\frac{\partial L}{\partial p_A} = \frac{\partial}{\partial p_A}\left[n_{AA}\log p_{AA} + n_{AO}\log p_{AO} + n_{AB}\log p_{AB} - \lambda(p_A + p_B + p_O - 1)\right] = \frac{2 n_{AA}}{p_A} + \frac{2 n_{AO}\, p_O}{2 p_A p_O} + \frac{2 n_{AB}\, p_B}{2 p_A p_B} - \lambda = \frac{2 n_{AA} + n_{AO} + n_{AB}}{p_A} - \lambda = 0$
$\frac{\partial L}{\partial p_B} = \frac{\partial}{\partial p_B}\left[n_{BB}\log p_{BB} + n_{BO}\log p_{BO} + n_{AB}\log p_{AB} - \lambda(p_A + p_B + p_O - 1)\right] = \frac{2 n_{BB}}{p_B} + \frac{2 n_{BO}\, p_O}{2 p_B p_O} + \frac{2 n_{AB}\, p_A}{2 p_A p_B} - \lambda = \frac{2 n_{BB} + n_{BO} + n_{AB}}{p_B} - \lambda = 0$
$\frac{\partial L}{\partial p_O} = \frac{\partial}{\partial p_O}\left[n_{OO}\log p_{OO} + n_{AO}\log p_{AO} + n_{BO}\log p_{BO} - \lambda(p_A + p_B + p_O - 1)\right] = \frac{2 n_{OO}}{p_O} + \frac{2 n_{AO}\, p_A}{2 p_A p_O} + \frac{2 n_{BO}\, p_B}{2 p_O p_B} - \lambda = \frac{2 n_{OO} + n_{AO} + n_{BO}}{p_O} - \lambda = 0$
$\frac{\partial L}{\partial \lambda} = -(p_A + p_B + p_O - 1) = 0$
Multiplying the stationarity conditions for $p_A$, $p_B$ and $p_O$ by their corresponding probabilities and adding them, we get:
$2(n_{AA} + n_{AO} + n_{BB} + n_{BO} + n_{OO} + n_{AB}) - \lambda(p_A + p_B + p_O) = 0$
Since the sum of all genotype counts is $n$ and the sum of all probabilities is 1, we get:
$2n - \lambda = 0$
$\lambda = 2n = 2 \cdot (186 + 38 + 284 + 13) = 2 \cdot 521 = 1042$
Substituting the value of $\lambda$ into the stationarity conditions for $p_A$ and $p_B$, we get:
$\frac{2 n_{AA} + n_{AO} + n_{AB}}{p_A} - \lambda = 0$
$p_A = \frac{2 n_{AA} + n_{AO} + n_{AB}}{\lambda} = \frac{2 \cdot 62 + 124 + 13}{1042} = \frac{261}{1042} \approx 0.25$
Now, for $p_B$:
$\frac{2 n_{BB} + n_{BO} + n_{AB}}{p_B} - \lambda = 0$
$p_B = \frac{2 n_{BB} + n_{BO} + n_{AB}}{\lambda} = \frac{2 \cdot 13 + 25 + 13}{1042} = \frac{64}{1042} \approx 0.061$
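These E- and M-step numbers can be reproduced with a few lines of Python (a sketch using the unrounded E-step counts, which give the same values after rounding to 3 decimal places):
# given phenotype counts and initial p_A = p_B = p_O = 1/3
nA, nB, nO, nAB = 186, 38, 284, 13
# E-step: split n_A and n_B in proportion to p_AA : p_AO = 1/9 : 2/9 (and likewise for B)
nAA, nAO = nA * 1 / 3, nA * 2 / 3   # 62.0, 124.0
nBB, nBO = nB * 1 / 3, nB * 2 / 3   # ~12.67, ~25.33
# M-step: lambda = 2n, then p_A and p_B from the stationarity conditions above
n = nA + nB + nO + nAB              # 521
lam = 2 * n                         # 1042
pA = (2 * nAA + nAO + nAB) / lam
pB = (2 * nBB + nBO + nAB) / lam
print(round(pA, 3), round(pB, 3))   # 0.25 0.061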
3. GMM implementation [40 + 10 + 5(bonus) pts]
A Gaussian Mixture Model (GMM) is a probabilistic model that assumes all the data points are generated from a mixture of a finite number of Gaussian distributions. In a nutshell, GMM is a soft clustering algorithm in the sense that each data point is assigned to a cluster with a probability. In order to do that, we need to convert our clustering problem into an inference problem.
Given $N$ samples $X = [x_1, x_2, \ldots, x_N]^T$, where $x_i \in \mathbb{R}^D$, let $\pi$ be a K-dimensional probability distribution and $(\mu_k, \Sigma_k)$ be the mean and covariance matrix of the $k$th Gaussian distribution in $\mathbb{R}^D$.
The GMM object implements EM algorithms for fitting the model and MLE for optimizing its parameters. It also makes some particular hypotheses about how the data was generated:
Each data point $x_i$ is assigned to a cluster $k$ with probability $\pi_k$, where $\sum_{k=1}^{K} \pi_k = 1$
Each data point $x_i$ is generated from a Multivariate Normal Distribution $\mathcal{N}(\mu_k, \Sigma_k)$, where $\mu_k \in \mathbb{R}^D$ and $\Sigma_k \in \mathbb{R}^{D \times D}$
Our goal is to find a mixture of $K$ Gaussian distributions to model our data $X$. This can be done by learning the parameters $\pi, \mu$ and $\Sigma$ through the likelihood function. A detailed derivation can be found in our GMM slides. The log-likelihood function now becomes:
$\ln p(x_1, \ldots, x_N|\pi, \mu, \Sigma) = \sum_{i=1}^{N} \ln\left(\sum_{k=1}^{K} \pi_k\, \mathcal{N}(x_i|\mu_k, \Sigma_k)\right)$
From the lecture we know that the MLEs for GMM all depend on each other and on the responsibility $\tau$. Thus, we need to use an iterative algorithm (the EM algorithm) to find the parameter estimates that maximize our likelihood function. All detailed derivations can be found in the GMM lecture slides.
E-step: Evaluate the responsibilities
In this step, we need to calculate the responsibility $\tau$, which is the conditional probability that a data point belongs to a specific cluster $k$ given the data point, i.e. $P(z_k|x)$. The formula for $\tau$ is given below:
$\tau(z_k) = \frac{\pi_k\, \mathcal{N}(x|\mu_k, \Sigma_k)}{\sum_{j=1}^{K} \pi_j\, \mathcal{N}(x|\mu_j, \Sigma_j)}, \quad \text{for } k = 1, \ldots, K$
Note that each data point should have one probability for each component/cluster. For this homework, you will work with $\tau(z_k)$, which has size $N \times K$, and you should have all the responsibility values in one matrix. We use gamma to denote $\tau$ in this homework.
M-step: Re-estimate Parameters
After we obtain the responsibilities, we can find the updates of the parameters, which are given below:
$\mu_k^{new} = \frac{\sum_{n=1}^{N} \tau(z_k)\, x_n}{N_k}, \qquad \Sigma_k^{new} = \frac{1}{N_k} \sum_{n=1}^{N} \tau(z_k)^T (x_n - \mu_k^{new})^T (x_n - \mu_k^{new}), \qquad \pi_k^{new} = \frac{N_k}{N}$
where $N_k = \sum_{n=1}^{N} \tau(z_k)$. Note that the updated value of $\mu_k$ is used when updating $\Sigma_k$. The multiplication of $\tau(z_k)^T$ with $(x_n - \mu_k^{new})^T$ is element-wise, so it preserves the dimensions of $(x_n - \mu_k^{new})^T$.
We repeat E and M steps until the incremental improvement to the likelihood function is small.
Special Notes
For undergraduate students: you may assume that the covariance matrix $\Sigma$ is a diagonal matrix, which means the features are independent (i.e. the red intensity of a pixel is independent of its blue intensity, etc.).
For graduate students: please assume a full covariance matrix.
The class notes assume that your dataset $X$ is $(D, N)$. However, the homework dataset is $(N, D)$ as mentioned in the instructions, so the formulas differ slightly from the lecture notes in order to obtain the right dimensions of the parameters.
Hints
DO NOT USE FOR LOOPS OVER N. You can always find a way to avoid looping over the observation data points in our homework problem. If you have to loop over D or K, that would be fine.
You can initialize $\pi_k$ to be the same for each $k$, i.e. $\pi_k = \frac{1}{K}, \forall k = 1, 2, \ldots, K$.
In part 3 you are asked to generate the model for pixel clustering of an image. We will need to use a multivariate Gaussian because each image has $N$ pixels and $D = 3$ features, which correspond to the red, green, and blue color intensities. This means each image is an $(N \times 3)$ dataset matrix. In the following parts, remember $D = 3$ in this problem.
To avoid using for loops in your code, we recommend you take a look at the concept of array broadcasting in Numpy. Some calculations that require arrays of different shapes can also be achieved by broadcasting.
Be careful of the dimensions of your parameters. Before you test anything on the autograder, please look at the instructions below on the shapes of the variables you need to output. This could enhance the functionality of your code and help you debug. Also notice that a numpy array of shape $(N, 1)$ is NOT the same as one of shape $(N,)$, so be careful and consistent about which you are using. You can see a detailed explanation here: Difference between numpy.array shape (R, 1) and (R,)
The dataset $X$: $(N, D)$
$\mu$: $(K, D)$
$\Sigma$: $(K, D, D)$
$\tau$: $(N, K)$
$\pi$: array of length $K$
ll_joint: $(N, K)$
3.1 Helper functions
To facilitate some of the operations in the GMM implementation, we would like you to implement the following three helper functions. In these functions, "logit" refers to an input array of size $(N, D)$. Remember that the goal of the helper functions is to facilitate our calculation, so DO NOT USE A FOR LOOP OVER N.
3.1.1. softmax
Given $logit \in \mathbb{R}^{N \times D}$, calculate $prob \in \mathbb{R}^{N \times D}$, where $prob_{i,j} = \frac{\exp(logit_{i,j})}{\sum_{d=1}^{D} \exp(logit_{i,d})}$.
Note: it is possible that $logit_{i,j}$ is very large, making $\exp(\cdot)$ explode. To keep the computation numerically stable, subtract the maximum of each row of $logit$ before exponentiating; the softmax is unchanged by this shift.
3.1.2. logsumexp
Given $logit \in \mathbb{R}^{N \times D}$, calculate $s \in \mathbb{R}^N$, where $s_i = \log\left(\sum_{j=1}^{D} \exp(logit_{i,j})\right)$. Again, pay attention to the numerical issues; you may want to use a similar trick as in the softmax function, adding the row maximum back at the end. Note: This function is used in the __call__() function, which is given, so you will not need it in your own implementation. It helps calculate the log-likelihood loss.
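Both tricks rest on the following identities, where $m_i = \max_d logit_{i,d}$ is the per-row maximum: subtracting $m_i$ leaves the softmax unchanged, while for logsumexp the maximum is added back at the end.
$\mathrm{softmax}(logit_i)_j = \mathrm{softmax}(logit_i - m_i)_j, \qquad \log\sum_{j=1}^{D} e^{logit_{i,j}} = m_i + \log\sum_{j=1}^{D} e^{logit_{i,j} - m_i}$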
3.1.3. Multivariate Gaussian PDF
You should be able to write your own function based on the following formula, and you are NOT allowed to use outside resource packages other than those we provided.
(for undergrads only) normalPDF
Using the covariance matrix as a diagonal matrix with variances of the individual variables appearing on the main diagonal of the matrix and zeros everywhere else means that we assume the features are independent. In this case, the multivariate normal density function simplifies to the expression below:
$\mathcal{N}(x; \mu, \Sigma) = \prod_{i=1}^{D} \frac{1}{\sqrt{2\pi\sigma_i^2}} \exp\left(-\frac{1}{2\sigma_i^2}(x_i - \mu_i)^2\right)$
where $\sigma_i^2$ is the variance of the $i$th feature, i.e. the $i$th diagonal element of the covariance matrix.
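Since this notebook takes the graduate route, normalPDF is left unimplemented in the class below; a minimal vectorized sketch under the diagonal-covariance assumption (normal_pdf_sketch is a hypothetical name; points is N x D, mu_i has length D, and sigma_i is D x D with the variances on its diagonal) could look like:
import numpy as np
def normal_pdf_sketch(points, mu_i, sigma_i):
    # product over features of univariate normal densities, computed in log space for stability
    var = np.diagonal(sigma_i) + 1e-32      # length-D variances; the small value avoids division by zero
    diff_sq = (points - mu_i) ** 2          # N x D squared deviations
    log_pdf = -0.5 * np.sum(np.log(2 * np.pi * var) + diff_sq / var, axis=1)
    return np.exp(log_pdf)                  # length-N array of densities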
(for grads only) multinormalPDF
Given the dataset $X \in \mathbb{R}^{N \times D}$, the mean vector $\mu \in \mathbb{R}^D$ and the covariance matrix $\Sigma \in \mathbb{R}^{D \times D}$ of a multivariate Gaussian distribution, calculate the probability $p \in \mathbb{R}^N$ of each data point. The PDF is given by
$\mathcal{N}(X; \mu, \Sigma) = \frac{1}{(2\pi)^{D/2}}\,|\Sigma|^{-1/2} \exp\left(-\frac{1}{2}(X - \mu)\Sigma^{-1}(X - \mu)^T\right)$
where $|\Sigma|$ is the determinant of the covariance matrix.
Hints
If you encounter "LinAlgError", you can mitigate it by adding a small value to the array before taking the operation, e.g. np.linalg.inv($\Sigma_k$ + 1e-32). You can also catch and handle such errors with a try/except block in Python.
In the above calculation, you must avoid computing an $(N, N)$ matrix. Using the above equation directly for large N will crash your kernel and/or give you a memory error on Gradescope. Instead, you can do the same operation by calculating $(X - \mu)\Sigma^{-1}$, an $(N, D)$ matrix, transposing it to a $(D, N)$ matrix, and doing an element-wise multiplication with $(X - \mu)^T$, which is also a $(D, N)$ matrix. Lastly, sum over the 0 axis to get a $(1, N)$ matrix before proceeding with the rest of the calculation. This works because doing an element-wise multiplication and summing over the 0 axis is the same as taking the diagonal of the $(N, N)$ matrix from the matrix multiplication (see the sketch after these hints).
In the Numpy implementation of $\mu$, you can either use a 2-D array with dimension $(1, D)$ for each Gaussian distribution, or a 1-D array of length $D$. The same goes for the other array parameters. Both ways are acceptable, but pay attention to shape-mismatch problems and be consistent throughout your implementation.
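The diagonal trick from the hint above can also be checked on its own (a standalone sketch with made-up data; X, mu and Sigma are hypothetical stand-ins for one component's parameters):
import numpy as np
# quadratic form q[n] = (x_n - mu) Sigma^{-1} (x_n - mu)^T for every row, without forming an N x N matrix
rng = np.random.default_rng(0)
N, D = 1000, 3
X = rng.normal(size=(N, D))
mu = rng.normal(size=D)
A = rng.normal(size=(D, D))
Sigma = A @ A.T + np.eye(D)                                # a valid (positive definite) covariance matrix
diff = X - mu                                              # N x D
q = np.sum((diff @ np.linalg.inv(Sigma)) * diff, axis=1)   # length-N vector of quadratic forms
# equivalent to np.diag(diff @ np.linalg.inv(Sigma) @ diff.T), but O(N D^2) instead of O(N^2 D)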
3.2 GMM Implementation
Things to do in this problem:
3.2.1. Initialize parameters in _init_components() [5 pts]
Examples of how you can initialize the parameters.
Set the prior probability $\pi$ the same for each class.
Initialize $\mu$ by randomly selecting K observations as the initial mean vectors, and initialize the covariance matrix with np.eye() for each k. For grads, you can also initialize $\Sigma$ with K diagonal matrices; it will become a full matrix after one iteration, as long as you adopt the correct computation.
Other ways of initialization are acceptable and welcome.
3.2.2. Formulate the log-likelihood function _ll_joint()
The log-likelihood function is given by:
$\ell(\theta) = \sum_{i=1}^{N} \ln\left(\sum_{k=1}^{K} \pi_k\, \mathcal{N}(x_i|\mu_k, \Sigma_k)\right)$
In this part, we will generate an $(N, K)$ matrix where each datapoint $x_i, \forall i = 1, \ldots, N$ has $K$ log-likelihood values. Thus, for each $i = 1, \ldots, N$ and $k = 1, \ldots, K$,
$\text{log-likelihood}[i, k] = \log \pi_k + \log \mathcal{N}(x_i|\mu_k, \Sigma_k)$
Hints:
If you encounter "ZeroDivisionError" or "RuntimeWarning: divide by zero encountered in log", you can mitigate it by adding a small value to the number/array before taking the operation, e.g. np.log($\pi_k$ + 1e-32).
You need to use the Multivariate Normal PDF function you created in the last part. Remember the PDF function is for each Gaussian Distribution (i.e. for each k) so you need to use a for loop over K.
3.2.3. Setup Iterative steps for EM Algorithm
You can find the detailed instructions in the description box above.
Hints:
For the E-step, we already get the log-likelihood from the _ll_joint() function. This is not the same as the responsibilities ($\tau$), but you should be able to finish this part with just a few lines of code using _ll_joint() and softmax() defined above.
For undergrads: try to simplify your calculation of $\Sigma$ in the M-step since you assumed independent components. Make sure you are only taking the diagonal terms of your calculated covariance matrix.
In [13]:
class GMM(object):
def __init__(self, X, K, max_iters = 100): # No need to change
"""
Args:
X: the observations/datapoints, N x D numpy array
K: number of clusters/components
max_iters: maximum number of iterations (used in EM implementation)
"""
self.points = X
self.max_iters = max_iters
self.N = self.points.shape[0] #number of observations
self.D = self.points.shape[1] #number of features
self.K = K #number of components/clusters
#Helper function for you to implement
def softmax(self, logit): # [5pts]
"""
Args:
logit: N x D numpy array
Return:
prob: N x D numpy array. See the above function.
"""
#max of each row in logit
max_logit = np.max(logit, axis=1)
#subtract max of each row from logit
logit-= max_logit[:, None]
#initialize prob
prob = np.exp(logit)
prob/= np.sum(np.exp(logit), axis=1)[:, None]
return prob
# raise NotImplementedError
def logsumexp(self, logit): # [5pts]
"""
Args:
logit: N x D numpy array
Return:
s: N x 1 array where s[i,0] = logsumexp(logit[i,:]). See the above function
"""
#max of each row in logit
max_logit = np.max(logit, axis=1)
#subtract max of each row from logit
logit-= max_logit[:, None]
#initialize s
s = np.log(np.sum(np.exp(logit), axis=1)[:, None] + 1e-32)
#add max back to s
s+=max_logit[:, None]
return s
# raise NotImplementedError
#for undergraduate student
def normalPDF(self, logit, mu_i, sigma_i): #[5pts]
"""
Args:
logit: N x D numpy array
mu_i: 1xD numpy array (or array of length D), the center for the ith gaussian.
sigma_i: 1xDxD 3-D numpy array (or DxD 2-D numpy array), the covariance matrix of the ith gaussian.
Return:
pdf: 1xN numpy array (or array of length N), the probability distribution of N data for the ith gaussian
Hint:
np.diagonal() should be handy.
"""
raise NotImplementedError
#for grad students
def multinormalPDF(self, logits, mu_i, sigma_i): #[5pts]
"""
Args:
logit: N x D numpy array
mu_i: 1xD numpy array (or array of length D), the center for the ith gaussian.
sigma_i: 1xDxD 3-D numpy array (or DxD 2-D numpy array), the covariance matrix of the ith gaussian.
Return:
pdf: 1xN numpy array (or array of length N), the probability distribution of N data for the ith gaussian
Hint:
np.linalg.det() and np.linalg.inv() should be handy.
"""
N,D = logits.shape[0], logits.shape[1]
#term1
#adding very small value to sigma to avoid singular matrix error
t1 = np.dot((logits - mu_i.T[None, :]),np.linalg.inv(sigma_i+0.00001)).T
#term2
t2 = (logits - mu_i.T[None, :]).T
#element-wise multiplication
t3 = np.exp(np.sum(np.multiply(t1,t2), axis=0)*(-0.5))
#term 4
t4 = np.power(np.linalg.det(sigma_i), -0.5)
#term 5
t5 = np.power(np.power((2*np.pi), D/2), -1)
#putting it all together
pdf = t5*t4*t3
return pdf
# raise NotImplementedError
def _init_components(self, **kwargs): #
"""
Args:
kwargs: any other arguments you want
Return:
pi: numpy array of length K, prior
mu: KxD numpy array, the center for each gaussian.
sigma: KxDxD numpy array, the diagonal standard deviation of each gaussian.
"""
N, D, K = self.N, self.D, self.K
points = self.points
#initialize pi as same probability for each class
pi = np.array([1/K] * K)
#randomly select K observations as mean vectors
indices = np.random.choice(points.shape[0], size=K, replace=False)
# form the numpy array with these indices
mu = points[indices, :]
#sigma is np.eye for each K
sigma = np.array([np.eye(D)]*K)
return pi, mu, sigma
# raise NotImplementedError
def _ll_joint(self, pi, mu, sigma, **kwargs): #
"""
Args:
pi: np array of length K, the prior of each component
mu: KxD numpy array, the center for each gaussian.
sigma: KxDxD numpy array, the diagonal standard deviation of each gaussian. You will have KxDxD numpy
array for full covariance matrix case
Return:
ll(log-likelihood): NxK array, where ll(i, k) = log pi(k) + log NormalPDF(points_i | mu[k], sigma[k])
"""
N, D, K = self.N, self.D, self.K
logits = self.points
# initialize ll
ll = np.zeros((N, K))
# term1
t1 = np.log(pi + 1e-32)
# add t1 to ll
ll = ll + t1
# term2
for i in range(K):
mu_i = mu[i, :]
sigma_i = sigma[i, :, :]
pdf = self.multinormalPDF(logits, mu_i, sigma_i)
pdf = np.nan_to_num(pdf)
ll[:, i] += np.log(pdf + 1e-32)
# np.nan_to_num(ll)
return ll
def _E_step(self, pi, mu, sigma, **kwargs): #
"""
Args:
pi: np array of length K, the prior of each component
mu: KxD numpy array, the center for each gaussian.
sigma: KxDxD numpy array, the diagonal standard deviation of each gaussian.You will have KxDxD numpy
array for full covariance matrix case
Return:
gamma(tau): NxK array, the posterior distribution (a.k.a, the soft cluster assignment) for each observation.
Hint:
You should be able to do this with just a few lines of code by using _ll_joint() and softmax() defined above.
"""
logits = self.points
N, D, K = self.N, self.D, self.K
#initialize tau
tau = np.zeros((N, K))
for i in range(K):
mu_i = mu[i, :]
sigma_i = sigma[i, :, :]
pdf = self.multinormalPDF(logits, mu_i, sigma_i)
if np.isnan(pdf).any():
print('pdf has 0')
tau[:, i] = np.dot(pi[i], pdf).T
#term2, summation
t2 = np.sum(tau, axis=1)
tau = tau/t2[:, None]
return tau
# raise NotImplementedError
def _M_step(self, gamma, **kwargs): #
"""
Args:
gamma(tau): NxK array, the posterior distribution (a.k.a, the soft cluster assignment) for each observation.
Return:
pi: np array of length K, the prior of each component
mu: KxD numpy array, the center for each gaussian.
sigma: KxDxD numpy array, the diagonal standard deviation of each gaussian. You will have KxDxD numpy
array for full covariance matrix case
Hint:
There are formulas in the slide and in the above description box.
"""
N_k = np.sum(gamma, axis=0)
logits = self.points
N, D, K = self.N, self.D, self.K
#mu_k
mu_new = np.dot(gamma.T, logits)/ N_k.reshape(-1, 1)
#sigma_k
sigma_new = np.zeros((K, D, D))
for i in range(K):
#term1
t1 = (logits - mu_new[i]).T
#term2, element-wise multiplication
t2 = gamma[:, i] * t1
# t2 = np.multiply(gamma[:, i], t1)
#term3, dot product
t3 = np.dot(t2, (logits - mu_new[i]))
sigma_new[i] = t3/N_k[i]
#pi_k
pi_new = N_k/N
return pi_new, mu_new, sigma_new
# raise NotImplementedError
def __call__(self, abs_tol=1e-16, rel_tol=1e-16, **kwargs): # No need to change
"""
Args:
abs_tol: convergence criteria w.r.t absolute change of loss
rel_tol: convergence criteria w.r.t relative change of loss
kwargs: any additional arguments you want
Return:
gamma(tau): NxK array, the posterior distribution (a.k.a, the soft cluster assignment) for each observation.
(pi, mu, sigma): (1xK np array, KxD numpy array, KxDxD numpy array)
Hint:
You do not need to change it. For each iteration, we process E and M steps, then update the paramters.
"""
pi, mu, sigma = self._init_components(**kwargs)
if np.isnan(pi).any():
print('pi has 0')
if np.isnan(mu).any():
print('mu has 0')
if np.isnan(sigma).any():
print('sigma has 0')
pbar = tqdm(range(self.max_iters))
for it in pbar:
# E-step
gamma = self._E_step(pi, mu, sigma)
gamma = np.nan_to_num(gamma)
if np.isnan(gamma).any():
print('gamma has 0')
# M-step
pi, mu, sigma = self._M_step(gamma)
pi = np.nan_to_num(pi)
mu = np.nan_to_num(mu)
sigma = np.nan_to_num(sigma)
if np.isnan(pi).any():
print('pi_new has 0')
if np.isnan(mu).any():
print('mu_new has 0')
if np.isnan(sigma).any():
print('sigma_new has 0')
# calculate the negative log-likelihood of observation
joint_ll = self._ll_joint(pi, mu, sigma)
joint_ll = np.nan_to_num(joint_ll)
if np.isnan(joint_ll).any():
print('joint_ll has 0')
loss = -np.sum(self.logsumexp(joint_ll))
if it:
diff = np.abs(prev_loss - loss)
if diff < abs_tol and diff / prev_loss < rel_tol:
break
prev_loss = loss
pbar.set_description('iter %d, loss: %.4f' % (it, loss))
return gamma, (pi, mu, sigma)
3.3 Japanese art and pixel clustering [10pts + 5pts]
Ukiyo-e is a Japanese art genre predominant from the 17th through 19th centuries. In order to produce the intricate prints that came to represent the genre, artists carved wood blocks with the patterns for each color in a design. Paint would be applied to the block and later transferred to the print to form the image. In this section, you will use your GMM algorithm implementation to do pixel clustering and estimate how many wood blocks were likely used to produce a single print. That is to say, how many wood blocks would appropriately reproduce the original painting. (Hint: you can justify your answer based on visual inspection of the resulting images or on a different metric of your choosing)
You do NOT need to submit your code for this question to the autograder. Instead you should include whatever images/information you find relevant in the report.
In [14]:
# helper function for performing pixel clustering. You don't have to modify it
def cluster_pixels_gmm(image, K):
"""Clusters pixels in the input image
Args:
image: input image of shape(H, W, 3)
K: number of components
Return:
clustered_img: image of shape(H, W, 3) after pixel clustering
"""
im_height, im_width, im_channel = image.shape
flat_img = np.reshape(image, [-1, im_channel]).astype(np.float32)
gamma, (pi, mu, sigma) = GMM(flat_img, K = K, max_iters = 100)()
cluster_ids = np.argmax(gamma, axis=1)
centers = mu
gmm_img = np.reshape(centers[cluster_ids], (im_height, im_width, im_channel))
return gmm_img
# helper function for plotting images. You don't have to modify it
def plot_images(img_list, title_list, figsize=(20, 10)):
assert len(img_list) == len(title_list)
fig, axes = plt.subplots(1, len(title_list), figsize=figsize)
for i, ax in enumerate(axes):
ax.imshow(img_list[i] / 255.0)
ax.set_title(title_list[i])
ax.axis('off')
In [15]:
# pick 2 of the images in this list:
url0 = 'https://upload.wikimedia.org/wikipedia/commons/b/b1/Utagawa_Kunisada_I_%28c._1832%29_Dawn_at_Futami-ga-ura.jpg'
url1 = 'https://upload.wikimedia.org/wikipedia/commons/9/95/Hokusai_%281828%29_Cuckoo_and_Azaleas.jpg'
url2 = 'https://upload.wikimedia.org/wikipedia/commons/7/74/Kitao_Shigemasa_%281777%29_Geisha_and_a_servant_carrying_her_shamisen_box.jpg'
url3 = 'https://upload.wikimedia.org/wikipedia/commons/1/10/Kuniyoshi_Utagawa%2C_Suikoden_Series_4.jpg'
# example of loading images (url1 and url3 are used here)
image0 = imageio.imread(imageio.core.urlopen(url1).read())
image1 = imageio.imread(imageio.core.urlopen(url3).read())
# this is for you to implement
def find_n_woodblocks(image, min_clusters=5, max_clusters=15):
"""
Using the helper function above, find the optimal number of woodblocks that can appropriately reproduce a single image.
You can simply examine the answer based on your visual inspection (i.e. looking at the resulting images) or provide any metrics you prefer.
Args:
image: input image of shape(H, W, 3)
min_clusters, max_clusters: the minimum and maximum number of clusters you should test with. Defaults are 5 and 15.
(Usually the maximum number of clusters would not exceed 15)
Return:
plot: comparison between original image and image pixel clustering.
optional: any other information/metric/plot you think is necessary.
"""
img_list = [image]
title_list = ['original image']
for num_clusters in range(min_clusters, max_clusters+1):
np.seterr(divide='ignore', invalid='ignore')
gmm_img = cluster_pixels_gmm(image, num_clusters)
img_list.append(gmm_img)
title_list.append('GMM #clusters = ' + str(num_clusters))
plot_images(img_list, title_list)
find_n_woodblocks(image0, 5, 8)
find_n_woodblocks(image1, 5, 8)
# raise NotImplementedError
iter 99, loss: 8833709.0903: 100%|██████████| 100/100 [01:35<00:00, 1.04it/s]
iter 99, loss: 8824229.8151: 100%|██████████| 100/100 [02:00<00:00, 1.20s/it]
iter 99, loss: 8755945.0012: 100%|██████████| 100/100 [02:30<00:00, 1.51s/it]
iter 99, loss: 8780598.6015: 100%|██████████| 100/100 [02:40<00:00, 1.60s/it]
iter 99, loss: 9686920.7530: 100%|██████████| 100/100 [01:35<00:00, 1.05it/s]
iter 99, loss: 9593794.9253: 100%|██████████| 100/100 [02:00<00:00, 1.21s/it]
iter 99, loss: 9608781.9818: 100%|██████████| 100/100 [02:47<00:00, 1.67s/it]
iter 99, loss: 9567262.3851: 100%|██████████| 100/100 [02:55<00:00, 1.76s/it]
For my experiments, I used images 2 and 4 and ran the GMM clustering for various numbers of clusters. As we can see from the images above, using 8 clusters almost reproduces the original image. Hence, I estimate that the number of woodblocks used to produce each print was 8.
(Bonus for All) [5 pts]
Compare the full covariance matrix with the diagonal covariance matrix in GMM. Can you explain why the images are different with the same clusters? Note: You will have to implement both multinormalPDF and normalPDF, and add a few arguments in the original _ll_joint() and _Mstep() function to indicate which matrix you are using. You will earn full credit only if you implement both functions AND explain the reason.
In [ ]:
def compare_matrix(image, K):
"""
Args:
image: input image of shape(H, W, 3)
K: number of components
Return:
plot: comparison between full covariance matrix and diagonal covariance matrix.
"""
raise NotImplementedError
In [ ]:
compare_matrix(image1, 5)
4. (Bonus for Grad and Undergrad) A Wrench in the Machine [30pts]
Learning to work with messy data is a hallmark of a well-rounded data scientist. In most real-world settings the data given will usually have some issue, so it is important to learn skills to work around such impasses. This part of the assignment looks to expose you to clever ways to fix data using concepts that you have already learned in the prior questions.
The two solutions covered:
KNN Algorithm Approach
EM Algorithm Approach
Question
You are a consultant assigned to a company which refines raw materials. To refine the raw materials necessary for their operations, the company owns a vast fleet of machines. Stressing the importance of having minimum down time for refining, you have been tasked to find a way to predict whether a machine will need to be repaired or not. In order to aid you on the task, the company has supplied you with historical telemetric data from all of the machines. The features range from averages of temperature, frequencies, and other salient observations of the units. The specifics of the features are not pertinent to the classification; it can be assured that each feature is statistically significant. A unit is given a 1 if it is broken and a 0 otherwise.
However, due to a software bug in logging the telemetric data, 20% of the entries are missing labels and 30% are missing characterization data. Since simply removing the corrupted entries would not reflect the true variance of the data, your job is to implement a solution to clean the data so it can be properly classified.
Your job is to assist the company in cleaning their data and implementing a semi-supervised learning framework to help them create a general classifier.
You are given two files for this task:
telemetry_data.csv: the entire dataset with complete and incomplete data
validation_data.csv: a smaller, fully complete dataset made after the software bug had been fixed
4.1.a Data Cleaning
The first step is to break up the whole dataset into clear parts. All the data is randomly shuffled in one csv file. In order to move forward, the data needs to be split into three separate arrays (a possible split is sketched after the list below):
labeled_complete: containing the complete characterization data and corresponding labels (broken = 1 and OK = 0)
labeled_incomplete: containing partial characterization data and corresponding labels (broken = 1 and OK = 0)
unlabeled_complete: containing only complete material characterization results
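A possible way to perform this split (a sketch assuming, as in the KNN helper below, that the label is the last column and that missing entries appear as np.nan; split_telemetry_sketch is a hypothetical name):
import numpy as np
def split_telemetry_sketch(data):
    # rows with any missing feature (NaN outside the label column) vs. rows with a missing label
    missing_features = np.isnan(data[:, :-1]).any(axis=1)
    missing_labels = np.isnan(data[:, -1])
    labeled_complete = data[~missing_features & ~missing_labels]
    labeled_incomplete = data[missing_features & ~missing_labels]
    unlabeled_complete = data[~missing_features & missing_labels]
    return labeled_complete, labeled_incomplete, unlabeled_complete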
In [ ]:
def complete_(data):
"""
Args:
data: N x D numpy array
Return:
labeled_complete: n x D array where values contain both complete features and labels
"""
raise NotImplementedError
def incomplete_(data):
"""
Args:
data: N x D numpy array
Return:
labeled_incomplete: n x D array where values contain incomplete features but complete labels
"""
raise NotImplementedError
def unlabeled_(data):
"""
Args:
data: N x D numpy array
Return:
unlabeled_complete: n x D array where values contain complete features but incomplete labels
"""
raise NotImplementedError
4.1.b KNN [10pts]
The second step in this task is to clean the labeled_incomplete dataset by filling in the missing values with probable ones derived from the complete data. A useful approach to this type of problem is a k-nearest neighbors (k-NN) algorithm. For this application, the method consists of replacing each missing value of a given point with the mean of that feature over the closest k neighbors of that point.
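A minimal sketch of this idea (knn_impute_sketch is a hypothetical helper; it assumes the label sits in the last column, missing entries are np.nan, distances are measured on the observed features only, and neighbors are taken from complete points of the same class; the exact ordering and edge-case handling expected by the autograder may differ):
import numpy as np
def knn_impute_sketch(incomplete_points, complete_points, K):
    # fill each NaN feature with the mean of that feature over the K nearest same-class complete points
    filled = incomplete_points.copy()
    for row in filled:                                   # each row is a view, so edits happen in place
        same_class = complete_points[complete_points[:, -1] == row[-1]][:, :-1]
        feats = row[:-1]
        observed = ~np.isnan(feats)
        dists = np.sqrt(np.sum((same_class[:, observed] - feats[observed]) ** 2, axis=1))
        neighbors = same_class[np.argsort(dists)[:K]]    # K nearest complete points in the same class
        feats[~observed] = neighbors[:, ~observed].mean(axis=0)
    return np.vstack([complete_points, filled])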
In [ ]:
class CleanData(object):
def __init__(self): # No need to implement
pass
def pairwise_dist(self, x, y): # [0pts] - copy from kmeans
"""
Args:
x: N x D numpy array
y: M x D numpy array
Return:
dist: N x M array, where dist[i, j] is the euclidean distance between
x[i, :] and y[j, :]
"""
#l2 norm, ord=2
dist = np.linalg.norm(x[:, np.newaxis, :] - y, ord=2, axis=2)
return dist
# raise NotImplementedError
def __call__(self, incomplete_points, complete_points, K, **kwargs): # [10pts]
"""
Args:
incomplete_points: N_incomplete x (D+1) numpy array, the incomplete labeled observations
complete_points: N_complete x (D+1) numpy array, the complete labeled observations
K: integer, corresponding to the number of nearest neighbors you want to base your calculation on
kwargs: any other args you want
Return:
clean_points: (N_incomplete + N_complete) x (D+1) numpy array, containing both the complete points and the newly filled points
Hints: (1) You want to find the k-nearest neighbors within each class separately;
(2) There are missing values in all of the features. It might be more convenient to address each feature at a time.
"""
raise NotImplementedError
Below is a good expectation of what the process should look like on a toy dataset. If your output matches the answer below, you are on the right track.
In [ ]:
complete_data = np.array([[1.,2.,3.,1],[7.,8.,9.,0],[16.,17.,18.,1],[22.,23.,24.,0]])
incomplete_data = np.array([[1.,np.nan,3.,1],[7.,np.nan,9.,0],[np.nan,17.,18.,1],[np.nan,23.,24.,0]])
clean_data = CleanData()(incomplete_data, complete_data, 2)
print("*** Expected Answer - k = 2 ***")
print("""==complete data==
[[ 1. 5. 3. 1.]
[ 7. 8. 9. 0.]
[16. 17. 18. 1.]
[22. 23. 24. 0.]]
==incomplete data==
[[ 1. nan 3. 1.]
[ 7. nan 9. 0.]
[nan 17. 18. 1.]
[nan 23. 24. 0.]]
==clean_data==
[[ 1. 2. 3. 1. ]
[ 7. 8. 9. 0. ]
[16. 17. 18. 1. ]
[22. 23. 24. 0. ]
[14.5 23. 24. 0. ]
[ 7. 15.5 9. 0. ]
[ 8.5 17. 18. 1. ]
[ 1. 9.5 3. 1. ]]""")
print("\n*** My Answer - k = 2***")
print(clean_data)
4.2 Getting acquainted with semi-supervised learning approaches. [5pts]
You will implement a version of the algorithm presented in Table 1 of the paper "Text Classification from Labeled and Unlabeled Documents using EM" by Nigam et al. (2000). While you are recommended to read the whole paper, this assignment focuses on sections 1 through 5.2 and 6.1. Write a brief summary of three interesting highlights of the paper (50-word maximum).
4.3 Implementing the EM algorithm. [10 pts]
In your implementation of the EM algorithm proposed by Nigam et al. (2000) on Table 1, you will use a Gaussian Naive Bayes (GNB) classifier as opposed to a naive Bayes (NB) classifier. (Hint: Using a GNB in place of an NB will enable you to reuse most of the implementation you developed for GMM in this assignment. In fact, you can successfully solve the problem by simply modifying the call method.)
In [ ]:
class SemiSupervised(object):
def __init__(self): # No need to implement
pass
def softmax(self,logits): # [0 pts] - can use same as for GMM
"""
Args:
logits: N x D numpy array
"""
raise NotImplementedError
def logsumexp(self,logits): # [0 pts] - can use same as for GMM
"""
Args:
logits: N x D numpy array
Return:
s: N x 1 array where s[i,0] = logsumexp(logits[i,:])
"""
raise NotImplementedError
def _init_components(self, points, K, **kwargs): # [5 pts] - modify from GMM
"""
Args:
points: Nx(D+1) numpy array, the observations
K: number of components
kwargs: any other args you want
Return:
pi: numpy array of length K, prior
mu: KxD numpy array, the center for each gaussian.
sigma: KxDxD numpy array, the diagonal standard deviation of each gaussian.
Hint: The paper describes how you should initialize your algorithm.
"""
raise NotImplementedError
def _ll_joint(self, points, pi, mu, sigma, **kwargs): # [0 pts] - can use same as for GMM
"""
Args:
points: NxD numpy array, the observations
pi: np array of length K, the prior of each component
mu: KxD numpy array, the center for each gaussian.
sigma: KxDxD numpy array, the diagonal standard deviation of each gaussian.
Return:
ll(log-likelihood): NxK array, where ll(i, j) = log pi(j) + log NormalPDF(points_i | mu[j], sigma[j])
Hint: Assume that the three properties of the lithium-ion batteries (multivariate gaussian) are independent.
This allows you to treat it as a product of univariate gaussians.
"""
raise NotImplementedError
def _E_step(self, points, pi, mu, sigma, **kwargs): # [0 pts] - can use same as for GMM
"""
Args:
points: NxD numpy array, the observations
pi: np array of length K, the prior of each component
mu: KxD numpy array, the center for each gaussian.
sigma: KxDxD numpy array, the diagonal standard deviation of each gaussian.
Return:
gamma: NxK array, the posterior distribution (a.k.a, the soft cluster assignment) for each observation.
Hint: You should be able to do this with just a few lines of code by using _ll_joint() and softmax() defined above.
"""
raise NotImplementedError
def _M_step(self, points, gamma, **kwargs): # [0 pts] - can use same as for GMM
"""
Args:
points: NxD numpy array, the observations
gamma: NxK array, the posterior distribution (a.k.a, the soft cluster assignment) for each observation.
Return:
pi: np array of length K, the prior of each component
mu: KxD numpy array, the center for each gaussian.
sigma: KxDxD numpy array, the diagonal standard deviation of each gaussian.
Hint: There are formulas in the slide.
"""
raise NotImplementedError
def __call__(self, points, K, max_iters=100, abs_tol=1e-16, rel_tol=1e-16, **kwargs): # [5 pts] - modify from GMM
"""
Args:
points: NxD numpy array, where N is # points and D is the dimensionality
K: number of clusters
max_iters: maximum number of iterations
abs_tol: convergence criteria w.r.t absolute change of loss
rel_tol: convergence criteria w.r.t relative change of loss
kwargs: any additional arguments you want
Return:
gamma: NxK array, the posterior distribution (a.k.a, the soft cluster assignment) for each observation.
(pi, mu, sigma): (1xK np array, KxD numpy array, KxD numpy array), mu and sigma.
"""
raise NotImplementedError
4.4 Demonstrating the performance of the algorithm. [5pts]
Compare the classification error based on the Gaussian Naive Bayes (GNB) classifier you implemented following the Nigam et al. (2000) approach to the performance of a GNB classifier trained using only labeled data. Since you have not covered supervised learning in class, you are allowed to use the scikit learn library for training the GNB classifier based only on labeled data: https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html.
In [ ]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
class ComparePerformance(object):
def __init__(self): #No need to implement
pass
def accuracy_semi_supervised(self, points, independent, n=8):
"""
Args:
points: Nx(D+1) numpy array, where N is the number of points in the training set, D is the dimensionality, the last column
represents the labels (when available) or a flag that allows you to separate the unlabeled data.
independent: Nx(D+1) numpy array, where N is # points and D is the dimensionality and the last column are the correct labels
Return:
accuracy: floating number
"""
raise NotImplementedError
def accuracy_GNB_onlycomplete(self, points, independent, n=8):
"""
Args:
points: Nx(D+1) numpy array, where N is the number of only initially complete labeled points in the training set, D is the dimensionality, the last column
represents the labels.
independent: Nx(D+1) numpy array, where N is # points and D is the dimensionality and the last column are the correct labels
Return:
accuracy: floating number
"""
raise NotImplementedError
def accuracy_GNB_cleandata(self, points, independent, n=8):
"""
Args:
points: Nx(D+1) numpy array, where N is the number of clean labeled points in the training set, D is the dimensionality, the last column
represents the labels.
independent: Nx(D+1) numpy array, where N is # points and D is the dimensionality and the last column are the correct labels
Return:
accuracy: floating number
"""
raise NotImplementedError
In [ ]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
# Load and clean data for the next section
telemetry = np.loadtxt('data/telemetry.csv', delimiter=',')
labeled_complete = complete_(telemetry)
labeled_incomplete = incomplete_(telemetry)
unlabeled = unlabeled_(telemetry)
clean_data = CleanData()(labeled_incomplete, labeled_complete, 7)
# load unlabeled set
# append unlabeled flag
unlabeled_flag = -1*np.ones((unlabeled.shape[0],1))
unlabeled = np.concatenate((unlabeled, unlabeled_flag), 1)
unlabeled = np.delete(unlabeled, -1, axis=1)
# =========================================================================
# SEMI SUPERVISED
# format training data
points = np.concatenate((clean_data, unlabeled),0)
# train model
(pi, mu, sigma) = SemiSupervised()(points, 7)
# ==========================================================================
# COMPARISON
# load test data
independent = np.loadtxt('data/validation.csv', delimiter=',')
# classify test data
classification = SemiSupervised()._E_step(independent[:,:8], pi, mu, sigma)
classification = np.argmax(classification,axis=1)
# =========================================================================================
print("""===COMPARISON===""")
print("""SemiSupervised Accuracy:""", ComparePerformance().accuracy_semi_supervised(classification, independent))
print("""Supervised with clean data: GNB Accuracy:""", ComparePerformance().accuracy_GNB_onlycomplete(labeled_complete, independent))
print("""Supervised with only complete data: GNB Accuracy:""", ComparePerformance().accuracy_GNB_cleandata(clean_data, independent))