CS7641 Homework 2: KMeans Clustering, Silhouette Coefficient Evaluation, and the EM Algorithm

1. KMeans Clustering 
KMeans is trying to solve the following optimization problem:

$$\arg\min_{S} \sum_{i=1}^{K} \sum_{x_j \in S_i} \|x_j - \mu_i\|^2$$
where one needs to partition the $N$ observations into $K$ clusters $S = \{S_1, S_2, \dots, S_K\}$, and each cluster $S_i$ has $\mu_i$ as its center.
 
1.1 pairwise distance 
In this section, you are asked to implement the pairwise_dist function.

Given $X \in \mathbb{R}^{N \times D}$ and $Y \in \mathbb{R}^{M \times D}$, obtain the pairwise distance matrix $\text{dist} \in \mathbb{R}^{N \times M}$ using the Euclidean distance metric, where $\text{dist}_{i,j} = \|X_i - Y_j\|_2$.

DO NOT USE FOR LOOPS in your implementation -- they are slow and will make your code too slow to pass our grader. Use array broadcasting instead.
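The broadcasting approach used in the implementation below builds an N x M x D intermediate array; if memory ever becomes a concern for large N and M, a common alternative is to expand the squared distance as $\|x_i\|^2 + \|y_j\|^2 - 2\, x_i \cdot y_j$. A minimal sketch of that alternative (not required by the assignment; plain NumPy assumed):

import numpy as np

def pairwise_dist_expanded(x, y):
    # ||x_i - y_j||^2 = ||x_i||^2 + ||y_j||^2 - 2 * x_i . y_j
    sq = np.sum(x ** 2, axis=1)[:, None] + np.sum(y ** 2, axis=1)[None, :] - 2.0 * (x @ y.T)
    # clip tiny negative values caused by floating-point round-off before the square root
    return np.sqrt(np.maximum(sq, 0.0))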
 
1.2 KMeans Implementation 
In this section, you are asked to implement the _init_centers, _update_assignment, _update_centers, and _get_loss functions.

For the function signature, please see the corresponding doc strings.
 
1.3 Find the optimal number of clusters 
In this section, you are asked to implement find_optimal_num_clusters function.

You will now use the elbow method to find the optimal number of clusters.
 
1.4 Autograder test to find centers for data points 
To obtain these points, you need to pass the tests set up in the autograder. These will test the centers created by your implementation. Be sure to upload the correct files.
In [2]:
class KMeans(object):
    
    def __init__(self): #No need to implement
        pass
    
    def pairwise_dist(self, x, y): # [5 pts]
        """
        Args:
            x: N x D numpy array
            y: M x D numpy array
        Return:
            dist: N x M array, where dist[i, j] is the euclidean distance between 
                  x[i, :] and y[j, :]
        """
        #l2 norm, ord=2
        dist = np.linalg.norm(x[:, np.newaxis, :] - y, ord=2, axis=2)
        return dist
#         raise NotImplementedError

    def _init_centers(self, points, K, **kwargs): # 
        """
        Args:
            points: NxD numpy array, where N is # points and D is the dimensionality
            K: number of clusters
            kwargs: any additional arguments you want
        Return:
            centers: K x D numpy array, the centers. 
        """
        #K random indices from points
        indices = np.random.choice(points.shape[0], size=K, replace=False)
        #form the numpy array with these indices
        centers = points[indices,:]
        return centers
        
#         raise NotImplementedError

    def _update_assignment(self, centers, points): # 
        """
        Args:
            centers: KxD numpy array, where K is the number of clusters, and D is the dimension
            points: NxD numpy array, the observations
        Return:
            cluster_idx: numpy array of length N, the cluster assignment for each point
            
        Hint: You could call pairwise_dist() function.
        """
        #distances between points and all cluster centers
        distances = self.pairwise_dist(points,centers)
        #index of minimum distance for each row
        cluster_idx = np.argmin(distances,axis=1)
        return cluster_idx
#         raise NotImplementedError

    def _update_centers(self, old_centers, cluster_idx, points): # 
        """
        Args:
            old_centers: old centers KxD numpy array, where K is the number of clusters, and D is the dimension
            cluster_idx: numpy array of length N, the cluster assignment for each point
            points: NxD numpy array, the observations
        Return:
            centers: new centers, K x D numpy array, where K is the number of clusters, and D is the dimension.
        """
        K,D = old_centers.shape[0], old_centers.shape[1]
        #initialize centers as zero array
        centers = np.zeros((K,D))
        for i in range(K):
            #find mean of all points having i as cluster idx
            centers[i] = np.mean(points[cluster_idx==i, :], axis=0)
        return centers        
#         raise NotImplementedError

    def _get_loss(self, centers, cluster_idx, points): # 
        """
        Args:
            centers: KxD numpy array, where K is the number of clusters, and D is the dimension
            cluster_idx: numpy array of length N, the cluster assignment for each point
            points: NxD numpy array, the observations
        Return:
            loss: a single float number, which is the objective function of KMeans. 
        """
        #find squared distances between all points and cluster centers
        distances = np.linalg.norm(points[:, np.newaxis, :] - centers, ord=2, axis=2) ** 2
        #select distance from cluster center
        distance_from_cluster_center = distances[np.arange(len(distances)), cluster_idx]
        #loss is sum of all these distances
        loss = np.sum(distance_from_cluster_center)
        return loss
        
#         raise NotImplementedError
        
    def __call__(self, points, K, max_iters=100, abs_tol=1e-16, rel_tol=1e-16, verbose=False, **kwargs):
        """
        Args:
            points: NxD numpy array, where N is # points and D is the dimensionality
            K: number of clusters
            max_iters: maximum number of iterations (Hint: You could change it when debugging)
            abs_tol: convergence criteria w.r.t absolute change of loss
            rel_tol: convergence criteria w.r.t relative change of loss
            verbose: boolean to set whether method should print loss (Hint: helpful for debugging)
            kwargs: any additional arguments you want
        Return:
            cluster assignments: Nx1 int numpy array
            cluster centers: K x D numpy array, the centers
            loss: final loss value of the objective function of KMeans
        """
        centers = self._init_centers(points, K, **kwargs)
        for it in range(max_iters):
            cluster_idx = self._update_assignment(centers, points)
            centers = self._update_centers(centers, cluster_idx, points)
            loss = self._get_loss(centers, cluster_idx, points)
            K = centers.shape[0]
            if it:
                diff = np.abs(prev_loss - loss)
                if diff < abs_tol and diff / prev_loss < rel_tol:
                    break
            prev_loss = loss
            if verbose:
                print('iter %d, loss: %.4f' % (it, loss))
        return cluster_idx, centers, loss
    
    def find_optimal_num_clusters(self, data, max_K=15): # 
        """Plots loss values for different number of clusters in K-Means
        
        Args:
            data: NxD numpy array of observations (e.g. the flattened image)
            max_K: maximum number of clusters to try
        Return:
            losses: an array of loss denoting the loss of each number of clusters
        """
        losses = []
        #for each value of K, find the loss and append to array
        for i in range(1,max_K+1):
            cluster_idx, centers, loss = self.__call__(data, i)
            losses.append(loss)
        return losses
        
#         raise NotImplementedError

In [3]:
# Helper function for checking the implementation of the pairwise_dist function. Please DO NOT change this function
# TEST CASE
x = np.random.randn(2, 2)
y = np.random.randn(3, 2)

print("*** Expected Answer ***")
print("""==x==
[[ 1.62434536 -0.61175641]
 [-0.52817175 -1.07296862]]
==y==
[[ 0.86540763 -2.3015387 ]
 [ 1.74481176 -0.7612069 ]
 [ 0.3190391  -0.24937038]]
==dist==
[[1.85239052 0.19195729 1.35467638]
 [1.85780729 2.29426447 1.18155842]]""")


print("\n*** My Answer ***")
print("==x==")
print(x)
print("==y==")
print(y)
print("==dist==")
print(KMeans().pairwise_dist(x, y))

 
*** Expected Answer ***
==x==
[[ 1.62434536 -0.61175641]
 [-0.52817175 -1.07296862]]
==y==
[[ 0.86540763 -2.3015387 ]
 [ 1.74481176 -0.7612069 ]
 [ 0.3190391  -0.24937038]]
==dist==
[[1.85239052 0.19195729 1.35467638]
 [1.85780729 2.29426447 1.18155842]]

*** My Answer ***
==x==
[[ 1.62434536 -0.61175641]
 [-0.52817175 -1.07296862]]
==y==
[[ 0.86540763 -2.3015387 ]
 [ 1.74481176 -0.7612069 ]
 [ 0.3190391  -0.24937038]]
==dist==
[[1.85239052 0.19195729 1.35467638]
 [1.85780729 2.29426447 1.18155842]]

In [4]:
def image_to_matrix(image_file, grays=False):
    """
    Convert .png image to matrix
    of values.
    params:
    image_file = str
    grays = Boolean
    returns:
    img = (color) np.ndarray[np.ndarray[np.ndarray[float]]]
    or (grayscale) np.ndarray[np.ndarray[float]]
    """
    img = plt.imread(image_file)
    # in case of transparency values
    if len(img.shape) == 3 and img.shape[2] > 3:
        height, width, depth = img.shape
        new_img = np.zeros([height, width, 3])
        for r in range(height):
            for c in range(width):
                new_img[r, c, :] = img[r, c, 0:3]
        img = np.copy(new_img)
    if grays and len(img.shape) == 3:
        height, width = img.shape[0:2]
        new_img = np.zeros([height, width])
        for r in range(height):
            for c in range(width):
                new_img[r, c] = img[r, c, 0]
        img = new_img
    return img

In [5]:
image_values = image_to_matrix('./images/bird_color_24.png')

r = image_values.shape[0]
c = image_values.shape[1]
ch = image_values.shape[2]
# flatten the image_values
image_values = image_values.reshape(r*c,ch)

k = 6 # feel free to change this value
cluster_idx, centers, loss = KMeans()(image_values, k)
updated_image_values = np.copy(image_values)

# assign each pixel to cluster mean
for i in range(0,k):
    indices_current_cluster = np.where(cluster_idx == i)[0]
    updated_image_values[indices_current_cluster] = centers[i]
    
updated_image_values = updated_image_values.reshape(r,c,ch)

plt.figure(None,figsize=(9,12))
plt.imshow(updated_image_values)
plt.show()

 
 
In [6]:
KMeans().find_optimal_num_clusters(image_values)

Out[6]:
[25575.970952916756,
 14136.693323245287,
 10225.60985715669,
 6993.510281152104,
 6388.54308345205,
 4749.894457724676,
 3383.379286682485,
 3418.0366633949584,
 2738.2662250749277,
 2554.129469081243,
 2065.325028335666,
 1929.4098314621665,
 1965.2534688501296,
 1706.074071535149,
 1580.6903867544215]
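To locate the elbow visually, the returned losses can be plotted against the number of clusters. A minimal sketch, assuming numpy and matplotlib are already imported as np and plt as in the cells above:

losses = KMeans().find_optimal_num_clusters(image_values)
plt.plot(np.arange(1, len(losses) + 1), losses, marker='o')
plt.xlabel('number of clusters K')
plt.ylabel('KMeans loss')
plt.show()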
 
Silhouette Coefficient Evaluation 
The average silhouette of the data is another useful criterion for assessing the natural number of clusters. The silhouette of a data instance is a measure of how closely it is matched to data within its cluster and how loosely it is matched to data of the neighbouring cluster.

The silhouette value is a measure of how similar an object is to its own cluster (cohesion) compared to other clusters (separation). The silhouette ranges from −1 to +1, where a high value indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. If most objects have a high value, then the clustering configuration is appropriate. If many points have a low or negative value, then the clustering configuration may have too many or too few clusters.
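Concretely, for a point $i$, let $a(i)$ be the average distance to the other points in its own cluster and $b(i)$ the smallest average distance to the points of any other cluster; these are the intra- and inter-cluster distances computed by the functions below. The per-point silhouette value is

$$s(i) = \frac{b(i) - a(i)}{\max\big(a(i),\, b(i)\big)}$$

and the silhouette coefficient of a clustering is the average of $s(i)$ over all $N$ points.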
In [7]:
def intra_cluster_dist(cluster_idx, data, labels): # [4 pts]
    """
    Calculates the average distance from a point to other points within the same cluster
    
    Args:
        cluster_idx: the cluster index (label) for which we want to find the intra cluster distance
        data: NxD numpy array, where N is # points and D is the dimensionality
        labels: 1D array of length N where each number indicates the cluster assignment for that point
    Return:
        intra_dist_cluster: 1D array where the i_th entry denotes the average distance from point i 
                            in cluster denoted by cluster_idx to other points within the same cluster
    """
    # indices of rows with cluster==cluster_idx
    indices = [i for i in range(len(labels)) if labels[i]==cluster_idx]
    # find rows
    rows_of_interest = data[indices]
    # initialize intra_dist_cluster
    intra_dist_cluster = np.zeros((rows_of_interest.shape[0]))
    
    for i in range(intra_dist_cluster.shape[0]):
        x = np.concatenate((rows_of_interest[:i,:], rows_of_interest[i+1:,:]))
        y = rows_of_interest[i, :]
        intra_dist_cluster[i] = np.mean(np.sqrt(np.sum((x - y[None,:])**2, axis = 1)))
        
    return intra_dist_cluster
    #     raise NotImplementedError

def inter_cluster_dist(cluster_idx, data, labels): # 
    """
    Calculates the average distance from one cluster to the nearest cluster
    Args:
        cluster_idx: the cluster index (label) for which we want to find the inter cluster distance
        data: NxD numpy array, where N is # points and D is the dimensionality
        labels: 1D array of length N where each number indicates the cluster assignment for that point
    Return:
        inter_dist_cluster: 1D array where the i-th entry denotes the average distance from point i in cluster
                            denoted by cluster_idx to the nearest neighboring cluster
    """
    # find number of distinct clusters
    num_clusters = len(np.unique(labels))
    # separate data points by cluster
    separate_data = []
    for j in range(num_clusters):
        indices = [i for i in range(len(labels)) if labels[i] == j]
        separate_data.append(data[indices])

    # indices of rows with cluster==cluster_idx
    indices = [i for i in range(len(labels)) if labels[i] == cluster_idx]
    # find rows
    rows_of_interest = data[indices]

    # initialize inter_dist_cluster
    inter_dist_cluster = np.zeros((rows_of_interest.shape[0]))

    # find min. average distance from each point to nearest cluster
    for i in range(rows_of_interest.shape[0]):
        y = rows_of_interest[i, :]
        distances = []
        for j in range(len(separate_data)):
            if j != cluster_idx:
                x = separate_data[j]
                distances.append(np.mean(np.sqrt(np.sum((x - y[None, :]) ** 2, axis=1))))
        inter_dist_cluster[i] = min(distances)
    return inter_dist_cluster
    
#     raise NotImplementedError

def silhouette_coefficient(data, labels): #
    """
    Finds the silhouette coefficient of the current cluster assignment
    
    Args:
        data: NxD numpy array, where N is # points and D is the dimensionality
        labels: 1D array of length N where each number indicates the cluster assignment for that point
    Return:
        silhouette_coefficient: Silhouette coefficient of the current cluster assignment
    """
    #unique clusters
    clusters = np.unique(labels)
    #initialize silhouette coefficient
    SC = 0
    
    #for each value of cluster_idx, find inter and intra cluster distance, add SC(i) to SC
    for cluster_idx in clusters:
        intra_dist_cluster = intra_cluster_dist(cluster_idx, data, labels)
        inter_dist_cluster = inter_cluster_dist(cluster_idx, data, labels)
        #max of inter and intra cluster distance for each point
        max_of_both = np.max(np.array([intra_dist_cluster, inter_dist_cluster]), axis=0)
        #calculate SC of each point
        S = [x/y for x, y in zip(inter_dist_cluster - intra_dist_cluster, max_of_both)]
        #add sum of all SCs for that cluster to SC
        SC+= np.sum(S)
        
    #SC is average of sum of all individual SCs
    SC/= data.shape[0]
    
    return SC
        
        
#     raise NotImplementedError

In [8]:
def plot_silhouette_coefficient(data, max_K=15):
    """
    Plot silhouette coefficient for different number of clusters, no need to implement
    """
    clusters = np.arange(2, max_K+1)
    
    silhouette_coefficients = []
    for k in range(2, max_K+1):
        labels, _, _ = KMeans()(data, k)
        silhouette_coefficients.append(silhouette_coefficient(data, labels))
    plt.plot(clusters, silhouette_coefficients)
    return silhouette_coefficients


data = np.random.rand(200,3) * 100
plot_silhouette_coefficient(data)

Out[8]:
[0.27434894586234465,
 0.23974317309219537,
 0.25170126787277375,
 0.29110188891166133,
 0.32392134092333413,
 0.2978439345698108,
 0.24118691265277048,
 0.2861673492358512,
 0.27509945216139964,
 0.2710429481302329,
 0.27049817823622974,
 0.28742447415210476,
 0.28737886551571834,
 0.2721522663631721]
 
 
 
Limitation of K-Means
One of the limitations of K-Means clustering is that it depends largely on the shape of the dataset. A common example of this is trying to cluster one circle within another (concentric circles). K-means will fail to do this and will end up effectively drawing a line which crosses the circles. You can visualize this limitation in the cell below.
In [9]:
# visualize limitation of kmeans, do not have to implement
from sklearn.datasets import make_circles, make_moons

X1, y1 = make_circles(factor=0.5, noise=0.05, n_samples=1500)
X2, y2 = make_moons(noise=0.05, n_samples=1500)

def visualise(X, C, K):# Visualization of clustering. You don't need to change this function   
    fig, ax = plt.subplots()
    ax.scatter(X[:, 0], X[:, 1], c=C,cmap='rainbow')
    plt.title('Visualization of K = '+str(K), fontsize=15)
    plt.show()
    pass

cluster_idx1, centers1, loss1 = KMeans()(X1, 2)
visualise(X1, cluster_idx1, 2)

cluster_idx2, centers2, loss2 = KMeans()(X2, 2)
visualise(X2, cluster_idx2, 2)

 
 
 
 
 
2. EM algorithm
 
2.1 Performing EM Update 
A univariate Gaussian Mixture Model (GMM) has two components, both of which have their own mean and standard deviation. The model is defined by the following parameters:

$z \sim \text{Bernoulli}(\theta)$
$p(x|z=0) \sim \mathcal{N}(\mu, \sigma)$
$p(x|z=1) \sim \mathcal{N}(2\mu, 3\sigma)$
For a dataset of N datapoints, find the following:

2.1.1. Write the marginal probability of $x$, i.e. $p(x)$ 

2.1.2. E-Step: Compute the posterior probability, i.e. $p(z_i = k|x_i)$, where $k = \{0, 1\}$ 

2.1.3. M-Step: Compute the updated value of $\mu$ (You can keep $\sigma$ fixed for this) [3pts]

2.1.4. M-Step: Compute the updated value for $\sigma$ (You can keep $\mu$ fixed for this) [3pts]
 
2.1.1 Marginal probability of $x$, i.e. $p(x)$
Given: A univariate Gaussian mixture model with two components, defined by their parameters as:

$z \sim \text{Bernoulli}(\theta)$
$p(x|z=0) \sim \mathcal{N}(\mu, \sigma)$
$p(x|z=1) \sim \mathcal{N}(2\mu, 3\sigma)$
Note: Considering $p(z=0) = \theta$, and $\sigma$ and $3\sigma$ as the standard deviations, as suggested by the TAs on Piazza:


The marginal probability of $x$ can be calculated as:

$$p(x) = \sum_z p(z)\, p(x|z) = p(z=0)\, p(x|z=0) + p(z=1)\, p(x|z=1) = \theta\, \mathcal{N}(\mu, \sigma) + (1-\theta)\, \mathcal{N}(2\mu, 3\sigma)$$
 
2.1.2 E-Step: Posterior probability, i.e. $p(z_i = k|x_i)$, where $k = \{0, 1\}$
For k = 0, we get:


$$p(z_i=0|x_i) = \frac{p(z_i=0)\, p(x_i|z_i=0)}{p(x_i)} = \frac{\theta\, \mathcal{N}(\mu, \sigma)}{\theta\, \mathcal{N}(\mu, \sigma) + (1-\theta)\, \mathcal{N}(2\mu, 3\sigma)}$$

Similarly, for k = 1, we get:


$$p(z_i=1|x_i) = \frac{p(z_i=1)\, p(x_i|z_i=1)}{p(x_i)} = \frac{(1-\theta)\, \mathcal{N}(2\mu, 3\sigma)}{\theta\, \mathcal{N}(\mu, \sigma) + (1-\theta)\, \mathcal{N}(2\mu, 3\sigma)}$$
 
2.1.3 M-Step: Computing the updated value of $\mu$
To find the updated value of $\mu$, we need to solve the optimization problem:

$$\theta^{new} = \arg\max_{\theta} \sum_z p(Z|X, \theta^{old}) \ln p(X, Z|\theta)$$
To solve this optimization problem, we will take the derivative and equate it to 0.


Representing $p(Z=1|X, \theta^{old})$ as $\tau(z=1)$ and $p(Z=0|X, \theta^{old})$ as $\tau(z=0)$, and using the derivative of the log of the Gaussian pdf w.r.t. $\mu$, we get:

$$\frac{\partial}{\partial\mu} \sum_z p(Z|X, \theta^{old}) \ln p(X, Z|\theta) = \frac{\partial}{\partial\mu}\Big[\tau(z=1)\ln\big((1-\theta)\,\mathcal{N}(2\mu, 3\sigma)\big) + \tau(z=0)\ln\big(\theta\,\mathcal{N}(\mu, \sigma)\big)\Big] = \tau(z=1)\,\frac{\partial \ln \mathcal{N}(2\mu, 3\sigma)}{\partial\mu} + \tau(z=0)\,\frac{\partial \ln \mathcal{N}(\mu, \sigma)}{\partial\mu} = \tau(z=1)\,\frac{2(x-2\mu)}{9\sigma^2} + \tau(z=0)\,\frac{x-\mu}{\sigma^2}$$
Setting the above derivative to 0, we get:

$$\tau(z=1)\,\frac{2(x-2\mu)}{9\sigma^2} + \tau(z=0)\,\frac{x-\mu}{\sigma^2} = 0$$
$$2x\,\tau(z=1) - 4\mu\,\tau(z=1) + 9x\,\tau(z=0) - 9\mu\,\tau(z=0) = 0$$
$$x\big(2\tau(z=1) + 9\tau(z=0)\big) = \mu\big(9\tau(z=0) + 4\tau(z=1)\big)$$

This gives us the updated value of $\mu$ as:

$$\mu = \frac{x\big(2\tau(z=1) + 9\tau(z=0)\big)}{9\tau(z=0) + 4\tau(z=1)}$$
 
2.1.4 M-Step: Computing the updated value for $\sigma$:


For this part, we are solving the same optimization problem, but this time we will take the partial derivative w.r.t. $\sigma$.


Representing $p(Z=1|X, \theta^{old})$ as $\tau(z=1)$ and $p(Z=0|X, \theta^{old})$ as $\tau(z=0)$, and using the derivative of the log of the Gaussian pdf w.r.t. $\sigma$, we get:


$$\frac{\partial}{\partial\sigma} \sum_z p(Z|X, \theta^{old}) \ln p(X, Z|\theta) = \frac{\partial}{\partial\sigma}\Big[\tau(z=1)\ln\big((1-\theta)\,\mathcal{N}(2\mu, 3\sigma)\big) + \tau(z=0)\ln\big(\theta\,\mathcal{N}(\mu, \sigma)\big)\Big] = \tau(z=1)\,\frac{(x-2\mu)^2 - 9\sigma^2}{9\sigma^3} + \tau(z=0)\,\frac{(x-\mu)^2 - \sigma^2}{\sigma^3}$$
Setting the above derivative to 0, we get:


$$\tau(z=1)\,\frac{(x-2\mu)^2 - 9\sigma^2}{9\sigma^3} + \tau(z=0)\,\frac{(x-\mu)^2 - \sigma^2}{\sigma^3} = 0$$

$$\tau(z=1)\big((x-2\mu)^2 - 9\sigma^2\big) + 9\tau(z=0)\big((x-\mu)^2 - \sigma^2\big) = 0$$
$$\tau(z=1)(x-2\mu)^2 - 9\tau(z=1)\,\sigma^2 + 9\tau(z=0)(x-\mu)^2 - 9\tau(z=0)\,\sigma^2 = 0$$
$$\sigma^2 = \frac{\tau(z=1)(x-2\mu)^2 + 9\tau(z=0)(x-\mu)^2}{9\big(\tau(z=1) + \tau(z=0)\big)}$$
This gives us the updated value of $\sigma$ as:


$$\sigma = \frac{1}{3}\sqrt{\frac{\tau(z=1)(x-2\mu)^2 + 9\tau(z=0)(x-\mu)^2}{\tau(z=1) + \tau(z=0)}}$$
 
2.2 EM Algorithm in ABO Blood Groups 
In the ABO blood group system, each individual has a phenotype and a genotype as shown below. The genotype is made of underlying alleles (A, B, O).

| Phenotype | Genotype   |
|-----------|------------|
| A         | AA, AO, OA |
| B         | BB, BO, OB |
| O         | OO         |
| AB        | AB         |
In a research experiment, scientists wanted to model the distribution of the genotypes of the population. They collected the phenotype information from the participants, as this could be directly observed from an individual's blood group. The scientists, however, want to use this data to model the underlying genotype information. In order to help them obtain an understanding, you suggest using the EM algorithm to find the genotype distribution.

You know that the probability that an allele is present in an individual is independent of the probability of any other allele, i.e. $P(AO) = P(OA) = P(A)\,P(O)$ and so on. Also note that the genotype pairs (AO, OA) and (BO, OB) are identical and can be treated as AO and BO respectively. You also know that the alleles follow a multinomial distribution.

$$p(O) = 1 - p(A) - p(B)$$
Let $n_A, n_B, n_O, n_{AB}$ be the number of individuals with the phenotypes A, B, O and AB respectively. Let $n_{AA}, n_{AO}, n_{BB}, n_{BO}, n_{AB}$ be the numbers of individuals with genotypes AA, AO, BB, BO and AB respectively. These satisfy the following conditions:

$$n_A = n_{AA} + n_{AO}$$
$$n_B = n_{BB} + n_{BO}$$
$$n_A + n_B + n_O + n_{AB} = n$$
Given:

$$p_A = p_B = p_O = \tfrac{1}{3}$$
$$n_A = 186,\quad n_B = 38,\quad n_O = 284,\quad n_{AB} = 13$$
2.2.1. In the E-step, compute the values of $n_{AA}, n_{AO}, n_{BB}, n_{BO}$. [5pts]

2.2.2. In the M-step, find the new values of $p_A, p_B$ given the updated values from the E-step above. (Round off the answer to 3 decimal places) [5pts]
 
2.2.1 We have:


$$p_{AA} = p_A^2 = \left(\tfrac{1}{3}\right)^2 = \tfrac{1}{9}$$
$$p_{AO} = 2 p_A p_O = 2\left(\tfrac{1}{3}\right)^2 = \tfrac{2}{9}$$
$$p_{BB} = p_B^2 = \left(\tfrac{1}{3}\right)^2 = \tfrac{1}{9}$$
$$p_{BO} = 2 p_B p_O = 2\left(\tfrac{1}{3}\right)^2 = \tfrac{2}{9}$$
$$p_{OO} = p_O^2 = \left(\tfrac{1}{3}\right)^2 = \tfrac{1}{9}$$
$$p_{AB} = 2 p_A p_B = 2\left(\tfrac{1}{3}\right)^2 = \tfrac{2}{9}$$
We can calculate:

$$n_{AA} = n_A \cdot \frac{p_{AA}}{p_{AA} + p_{AO}} = 186 \cdot \frac{1/9}{1/9 + 2/9} = \frac{186}{3} = 62$$
$$n_{AO} = n_A \cdot \frac{p_{AO}}{p_{AA} + p_{AO}} = 186 \cdot \frac{2/9}{1/9 + 2/9} = \frac{2 \cdot 186}{3} = 124$$
$$n_{BB} = n_B \cdot \frac{p_{BB}}{p_{BB} + p_{BO}} = 38 \cdot \frac{1/9}{1/9 + 2/9} = \frac{38}{3} \approx 13$$
$$n_{BO} = n_B \cdot \frac{p_{BO}}{p_{BB} + p_{BO}} = 38 \cdot \frac{2/9}{1/9 + 2/9} = \frac{2 \cdot 38}{3} \approx 25$$
 
2.2.2 For the M-step, we will solve an optimization problem subject to the constraint $p_A + p_B + p_O = 1$.
We have the log-likelihood function:

$$LL(p_A, p_B, p_O) = \log\left(p_{AA}^{\,n_{AA}}\, p_{AO}^{\,n_{AO}}\, p_{BB}^{\,n_{BB}}\, p_{BO}^{\,n_{BO}}\, p_{OO}^{\,n_{OO}}\, p_{AB}^{\,n_{AB}}\right) = n_{AA}\log p_{AA} + n_{AO}\log p_{AO} + n_{BB}\log p_{BB} + n_{BO}\log p_{BO} + n_{OO}\log p_{OO} + n_{AB}\log p_{AB}$$
Thus, the Lagrange function is:

$$L(p_A, p_B, p_O) = LL(p_A, p_B, p_O) - \lambda(p_A + p_B + p_O - 1)$$
To solve the optimization problem, we find the partial derivatives and equate them to 0.

$$\frac{\partial L}{\partial p_A} = \frac{\partial}{\partial p_A}\Big[n_{AA}\log p_{AA} + n_{AO}\log p_{AO} + n_{AB}\log p_{AB} - \lambda(p_A + p_B + p_O - 1)\Big] = \frac{2 n_{AA}}{p_A} + \frac{2 n_{AO}\, p_O}{2 p_A p_O} + \frac{2 n_{AB}\, p_B}{2 p_A p_B} - \lambda = \frac{2 n_{AA} + n_{AO} + n_{AB}}{p_A} - \lambda = 0$$



$$\frac{\partial L}{\partial p_B} = \frac{\partial}{\partial p_B}\Big[n_{BB}\log p_{BB} + n_{BO}\log p_{BO} + n_{AB}\log p_{AB} - \lambda(p_A + p_B + p_O - 1)\Big] = \frac{2 n_{BB}}{p_B} + \frac{2 n_{BO}\, p_O}{2 p_B p_O} + \frac{2 n_{AB}\, p_A}{2 p_A p_B} - \lambda = \frac{2 n_{BB} + n_{BO} + n_{AB}}{p_B} - \lambda = 0$$



$$\frac{\partial L}{\partial p_O} = \frac{\partial}{\partial p_O}\Big[n_{OO}\log p_{OO} + n_{AO}\log p_{AO} + n_{BO}\log p_{BO} - \lambda(p_A + p_B + p_O - 1)\Big] = \frac{2 n_{OO}}{p_O} + \frac{2 n_{AO}\, p_A}{2 p_A p_O} + \frac{2 n_{BO}\, p_B}{2 p_O p_B} - \lambda = \frac{2 n_{OO} + n_{AO} + n_{BO}}{p_O} - \lambda = 0$$



$$\frac{\partial L}{\partial \lambda} = \frac{\partial}{\partial \lambda}\Big[-\lambda(p_A + p_B + p_O - 1)\Big] = -(p_A + p_B + p_O - 1) = 0$$
Multiplying each of the partial derivatives of $L(p_A, p_B, p_O)$ w.r.t. $p_A$, $p_B$ and $p_O$ by the corresponding probability and adding them, we get:


$$2(n_{AA} + n_{AO} + n_{BB} + n_{BO} + n_{OO} + n_{AB}) - \lambda(p_A + p_B + p_O) = 0$$
Since the sum of all frequencies is $n$ and the sum of all probabilities is 1, we get:

$$2n - \lambda = 0$$
$$\lambda = 2n = 2 \cdot (186 + 38 + 284 + 13) = 2 \cdot 521 = 1042$$
Substituting the value of $\lambda$ into the partial derivatives of $L(p_A, p_B, p_O)$ w.r.t. $p_A$ and $p_B$, we get:


$$\frac{2 n_{AA} + n_{AO} + n_{AB}}{p_A} - \lambda = 0$$



$$p_A = \frac{2 n_{AA} + n_{AO} + n_{AB}}{\lambda} = \frac{2 \cdot 62 + 124 + 13}{1042} = \frac{261}{1042} \approx 0.25$$
Now, for $p_B$,


$$\frac{2 n_{BB} + n_{BO} + n_{AB}}{p_B} - \lambda = 0$$
$$p_B = \frac{2 n_{BB} + n_{BO} + n_{AB}}{\lambda} = \frac{2 \cdot 13 + 25 + 13}{1042} = \frac{64}{1042} \approx 0.061$$
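As a quick sanity check on the arithmetic above, the E-step counts and the M-step probabilities can be reproduced with a few lines of NumPy (a standalone sketch, not part of the graded code):

import numpy as np

n_A, n_B, n_O, n_AB = 186, 38, 284, 13
p_A = p_B = p_O = 1.0 / 3.0

# E-step: split the phenotype counts into expected genotype counts
p_AA, p_AO = p_A ** 2, 2 * p_A * p_O
p_BB, p_BO = p_B ** 2, 2 * p_B * p_O
n_AA = n_A * p_AA / (p_AA + p_AO)    # 62.0
n_AO = n_A * p_AO / (p_AA + p_AO)    # 124.0
n_BB = n_B * p_BB / (p_BB + p_BO)    # ~12.67, rounded to 13 above
n_BO = n_B * p_BO / (p_BB + p_BO)    # ~25.33, rounded to 25 above

# M-step: allele frequencies from the expected genotype counts
n = n_A + n_B + n_O + n_AB           # 521
lam = 2 * n                          # 1042
p_A_new = (2 * n_AA + n_AO + n_AB) / lam   # ~0.250
p_B_new = (2 * n_BB + n_BO + n_AB) / lam   # ~0.061
print(round(p_A_new, 3), round(p_B_new, 3))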
 
3. GMM implementation [40 + 10 + 5(bonus) pts]
A Gaussian Mixture Model (GMM) is a probabilistic model that assumes all the data points are generated from a mixture of a finite number of Gaussian distributions. In a nutshell, GMM is a soft clustering algorithm in the sense that each data point is assigned to a cluster with a probability. In order to do that, we need to convert our clustering problem into an inference problem.

Given $N$ samples $X = [x_1, x_2, \dots, x_N]^T$, where $x_i \in \mathbb{R}^D$, let $\pi$ be a $K$-dimensional probability distribution and $(\mu_k, \Sigma_k)$ be the mean and covariance matrix of the $k$-th Gaussian distribution in $\mathbb{R}^D$.

The GMM object implements EM algorithms for fitting the model and MLE for optimizing its parameters. It also makes some particular hypotheses about how the data was generated:

Each data point $x_i$ is assigned to a cluster $k$ with probability $\pi_k$, where $\sum_{k=1}^{K}\pi_k = 1$
Each data point $x_i$ is generated from a multivariate normal distribution $\mathcal{N}(\mu_k, \Sigma_k)$, where $\mu_k \in \mathbb{R}^D$ and $\Sigma_k \in \mathbb{R}^{D \times D}$
Our goal is to find $K$ Gaussian distributions to model our data $X$. This can be done by learning the parameters $\pi$, $\mu$ and $\Sigma$ through the likelihood function. A detailed derivation can be found in our slides on GMM. The log-likelihood function now becomes:

$$\ln p(x_1, \dots, x_N | \pi, \mu, \Sigma) = \sum_{i=1}^{N} \ln\left(\sum_{k=1}^{K} \pi_k\, \mathcal{N}(x_i | \mu_k, \Sigma_k)\right)$$
 
From the lecture we know that the MLEs for GMM all depend on each other and on the responsibility $\tau$. Thus, we need to use an iterative algorithm (the EM algorithm) to find the estimate of the parameters that maximize our likelihood function. All detailed derivations can be found in the lecture slides on GMM.

E-step: Evaluate the responsibilities
In this step, we need to calculate the responsibility $\tau$, which is the conditional probability that a data point belongs to a specific cluster $k$ given the data point, i.e. $P(z_k|x)$. The formula for $\tau$ is given below:

$$\tau(z_k) = \frac{\pi_k\, \mathcal{N}(x|\mu_k, \Sigma_k)}{\sum_{j=1}^{K}\pi_j\, \mathcal{N}(x|\mu_j, \Sigma_j)}, \quad \text{for } k = 1, \dots, K$$
Note that each data point should have one probability for each component/cluster. For this homework, you will work with $\tau(z_k)$, which has a size of $N \times K$, and you should have all the responsibility values in one matrix. We use gamma as $\tau$ in this homework.
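Since $\log \pi_k + \log \mathcal{N}(x_i|\mu_k, \Sigma_k)$ is exactly what _ll_joint() returns for entry $(i, k)$, the E-step can also be written as a row-wise softmax of that matrix. The _E_step implementation further below computes $\tau$ directly from the PDFs; an equivalent sketch using the helper functions described above would be:

# inside GMM._E_step, assuming _ll_joint() and softmax() are implemented as described above
ll = self._ll_joint(pi, mu, sigma)   # N x K: log pi_k + log N(x_i | mu_k, Sigma_k)
gamma = self.softmax(ll)             # row-wise softmax yields the responsibilities tau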

M-step: Re-estimate Parameters
After we obtained the responsibility, we can find the update of parameters, which are given below:

$$\mu_k^{new} = \frac{\sum_{n=1}^{N}\tau(z_k)\, x_n}{N_k} \qquad \Sigma_k^{new} = \frac{1}{N_k}\sum_{n=1}^{N}\tau(z_k)^T (x_n - \mu_k^{new})^T (x_n - \mu_k^{new}) \qquad \pi_k^{new} = \frac{N_k}{N}$$
where $N_k = \sum_{n=1}^{N}\tau(z_k)$. Note that the updated value for $\mu_k$ is used when updating $\Sigma_k$. The multiplication of $\tau(z_k)^T (x_n - \mu_k^{new})^T$ is element-wise, so it will preserve the dimensions of $(x_n - \mu_k^{new})^T$.

We repeat E and M steps until the incremental improvement to the likelihood function is small.
 
Special Notes

For undergraduate students: you may assume that the covariance matrix $\Sigma$ is a diagonal matrix, which means the features are independent (i.e. the red intensity of a pixel is independent of its blue intensity, etc.).
For graduate students: please assume a full covariance matrix.
The class notes assume that your dataset $X$ is $(D, N)$. However, the homework dataset is $(N, D)$ as mentioned in the instructions, so the formulas are a little different from the lecture notes in order to obtain the right dimensions of the parameters.
Hints

DO NOT USE FOR LOOPS OVER N. You can always find a way to avoid looping over the observation data points in our homework problem. If you have to loop over D or K, that would be fine.
You can initialize $\pi_k$ the same for each $k$, i.e. $\pi_k = \frac{1}{K}, \forall k = 1, 2, \dots, K$.
In part 3 you are asked to generate the model for pixel clustering of an image. We will need to use a multivariate Gaussian because each image will have $N$ pixels and $D = 3$ features, which correspond to the red, green, and blue color intensities. This means each image is an $(N \times 3)$ dataset matrix. In the following parts, remember that $D = 3$ in this problem.
To avoid using for loops in your code, we recommend you take a look at the concept of array broadcasting in Numpy. Some calculations that require different shapes of arrays can be achieved by broadcasting.
Be careful with the dimensions of your parameters. Before you test anything on the autograder, please look at the instructions below on the shapes of the variables you need to output. This could enhance the functionality of your code and help you debug. Also notice that a numpy array of shape $(N, 1)$ is NOT the same as one of shape $(N,)$, so be careful and consistent about what you are using. You can see the detailed explanation here: Difference between numpy.array shape (R, 1) and (R,)

The dataset $X$: $(N, D)$
$\mu$: $(K, D)$
$\Sigma$: $(K, D, D)$
$\tau$: $(N, K)$
$\pi$: array of length $K$
ll_joint: $(N, K)$
 
3.1 Helper functions 
To facilitate some of the operations in the GMM implementation, we would like you to implement the following three helper functions. In these functions, "logit" refers to an input array of size $(N, D)$. Remember that the goal of the helper functions is to facilitate our calculation, so DO NOT USE FOR LOOPS OVER N.

3.1.1. softmax 
Given $\text{logit} \in \mathbb{R}^{N \times D}$, calculate $\text{prob} \in \mathbb{R}^{N \times D}$, where $\text{prob}_{i,j} = \frac{\exp(\text{logit}_{i,j})}{\sum_{d=1}^{D}\exp(\text{logit}_{i,d})}$.

Note: it is possible that $\text{logit}_{i,j}$ is very large, making $\exp(\cdot)$ of it explode. To make sure the computation is numerically stable, you should subtract the maximum of each row of logit before exponentiating (the shift cancels out in the softmax ratio, so nothing needs to be added back).

3.1.2. logsumexp 
Given $\text{logit} \in \mathbb{R}^{N \times D}$, calculate $s \in \mathbb{R}^N$, where $s_i = \log\left(\sum_{j=1}^{D}\exp(\text{logit}_{i,j})\right)$. Again, pay attention to the numerical problem: use a similar trick as in the softmax function, this time subtracting the row maximum before exponentiating and adding it back to the result. Note: This function is used in the __call__() function, which is given, so you will not need it in your own implementation. It helps calculate the loss of the log-likelihood.
 
3.1.3. Multivariate Gaussian PDF 
You should be able to write your own function based on the following formula, and you are NOT allowed to use outside resource packages other than those we provided.

(for undergrads only) normalPDF

Using the covariance matrix as a diagonal matrix, with the variances of the individual variables on the main diagonal and zeros everywhere else, means that we assume the features are independent. In this case, the multivariate normal density function simplifies to the expression below:

$$\mathcal{N}(x; \mu, \Sigma) = \prod_{i=1}^{D}\frac{1}{\sqrt{2\pi\sigma_i^2}}\exp\left(-\frac{1}{2\sigma_i^2}(x_i - \mu_i)^2\right)$$
where $\sigma_i^2$ is the variance of the $i$-th feature, i.e. the $i$-th diagonal element of the covariance matrix.
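The normalPDF method is left unimplemented in the GMM class further below; purely as an illustration of the formula above, a standalone sketch for the diagonal-covariance case (points as an N x D array, mu_i of length D, sigma_i a D x D matrix whose diagonal holds the variances) could look like:

import numpy as np

def normal_pdf_diag(points, mu_i, sigma_i):
    # variances sit on the diagonal of sigma_i; off-diagonal entries are ignored
    variances = np.diagonal(sigma_i)
    # squared deviations (x - mu)^2, shape N x D
    diff_sq = (points - mu_i[None, :]) ** 2
    # exponent of the product of D independent 1-D Gaussians
    exponent = -0.5 * np.sum(diff_sq / variances[None, :], axis=1)
    # normalizing constant: product over features of sqrt(2*pi*sigma_d^2)
    norm_const = np.prod(np.sqrt(2.0 * np.pi * variances))
    return np.exp(exponent) / norm_const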

(for grads only) multinormalPDF

Given the dataset $X \in \mathbb{R}^{N \times D}$, the mean vector $\mu \in \mathbb{R}^D$ and covariance matrix $\Sigma \in \mathbb{R}^{D \times D}$ of a multivariate Gaussian distribution, calculate the probability $p \in \mathbb{R}^N$ of each data point. The PDF is given by

$$\mathcal{N}(X; \mu, \Sigma) = \frac{1}{(2\pi)^{D/2}}\,|\Sigma|^{-1/2}\exp\left(-\frac{1}{2}(X - \mu)\,\Sigma^{-1}(X - \mu)^T\right)$$
where $|\Sigma|$ is the determinant of the covariance matrix.

Hints

If you encounter "LinAlgError", you can mitigate it by adding a small value to the number/array before taking the operation, e.g. np.linalg.inv($\Sigma_k$ + 1e-32). You can catch and handle such errors by using a try/except block in Python.
In the above calculation, you must avoid computing an $(N, N)$ matrix. Using the above equation directly for large $N$ will crash your kernel and/or give you a memory error on Gradescope. Instead, you can perform the same operation by calculating $(X - \mu)\Sigma^{-1}$, an $(N, D)$ matrix, transposing it to a $(D, N)$ matrix and doing an element-wise multiplication with $(X - \mu)^T$, which is also a $(D, N)$ matrix. Lastly, you will need to sum over the 0 axis to get a $(1, N)$ matrix before proceeding with the rest of the calculation. This uses the fact that doing an element-wise multiplication and summing over the 0 axis is the same as taking the diagonal of the $(N, N)$ matrix from the matrix multiplication.
In the Numpy implementation of $\mu$, you can either use a 2-D array with dimension $(1, D)$ for each Gaussian distribution, or a 1-D array of length $D$. The same applies to the other array parameters. Both ways are acceptable, but pay attention to shape-mismatch problems and be consistent throughout your implementation.
 
3.2 GMM Implementation 
Things to do in this problem:

3.2.1. Initialize parameters in _init_components() [5 pts]
Examples of how you can initialize the parameters.

Set the prior probability $\pi$ the same for each class.
Initialize $\mu$ by randomly selecting K observations as the initial mean vectors, and initialize the covariance matrix with np.eye() for each k. For grads, you can also initialize $\Sigma$ with K diagonal matrices; it will become a full matrix after one iteration, as long as you adopt the correct computation.
Other ways of initialization are acceptable and welcome.
3.2.2. Formulate the log-likelihood function _ll_joint() 
The log-likelihood function is given by:

$$\ell(\theta) = \sum_{i=1}^{N}\ln\left(\sum_{k=1}^{K}\pi_k\, \mathcal{N}(x_i|\mu_k, \Sigma_k)\right)$$
In this part, we will generate an $(N, K)$ matrix where each datapoint $x_i, \forall i = 1, \dots, N$ has $K$ log-likelihood values. Thus, for each $i = 1, \dots, N$ and $k = 1, \dots, K$,

$$\text{log-likelihood}[i, k] = \log \pi_k + \log \mathcal{N}(x_i|\mu_k, \Sigma_k)$$
Hints:

If you encounter "ZeroDivisionError" or "RuntimeWarning: divide by zero encountered in log", you can mitigate your number/array by summing a small value before taking the operation, e.g. np.log($\pi_k$ + 1e-32).
You need to use the Multivariate Normal PDF function you created in the last part. Remember the PDF function is for each Gaussian Distribution (i.e. for each k) so you need to use a for loop over K.
 
3.2.3. Setup Iterative steps for EM Algorithm 
You can find the detail instruction in the above description box.

Hints:

For the E-step, we already have the log-likelihood from the _ll_joint() function. This is not the same as the responsibilities ($\tau$), but you should be able to finish this part with just a few lines of code by using _ll_joint() and softmax() defined above.
For undergrads: Try to simplify your calculation for $\Sigma$ in the M-step, since you assumed independent components. Make sure you are only taking the diagonal terms of your calculated covariance matrix.
In [13]:
class GMM(object):
    def __init__(self, X, K, max_iters = 100): # No need to change
        """
        Args: 
            X: the observations/datapoints, N x D numpy array
            K: number of clusters/components
            max_iters: maximum number of iterations (used in EM implementation)
        """
        self.points = X
        self.max_iters = max_iters
        
        self.N = self.points.shape[0]        #number of observations
        self.D = self.points.shape[1]        #number of features
        self.K = K                           #number of components/clusters

    #Helper function for you to implement
    def softmax(self, logit): # [5pts]
        """
        Args:
            logit: N x D numpy array
        Return:
            prob: N x D numpy array. See the above function.
        """
        #max of each row in logit
        max_logit = np.max(logit, axis=1)
        #subtract max of each row from logit
        logit-= max_logit[:, None]
        #initialize prob
        prob = np.exp(logit)
        prob/= np.sum(np.exp(logit), axis=1)[:, None]
        return prob
#         raise NotImplementedError

    def logsumexp(self, logit): # [5pts]
        """
        Args:
            logit: N x D numpy array
        Return:
            s: N x 1 array where s[i,0] = logsumexp(logit[i,:]). See the above function
        """
        #max of each row in logit, kept for numerical stability
        max_logit = np.max(logit, axis=1, keepdims=True)
        #subtract the row maximum before exponentiating
        shifted = logit - max_logit
        #log-sum-exp of the shifted values, N x 1
        s = np.log(np.sum(np.exp(shifted), axis=1, keepdims=True) + 1e-32)
        #add the row maximum back
        s += max_logit
        return s
#         raise NotImplementedError

    #for undergraduate student
    def normalPDF(self, logit, mu_i, sigma_i): #[5pts]
        """
        Args: 
            logit: N x D numpy array
            mu_i: 1xD numpy array (or array of length D), the center for the ith gaussian.
            sigma_i: 1xDxD 3-D numpy array (or DxD 2-D numpy array), the covariance matrix of the ith gaussian.  
        Return:
            pdf: 1xN numpy array (or array of length N), the probability distribution of N data for the ith gaussian
            
        Hint: 
            np.diagonal() should be handy.
        """
        
        raise NotImplementedError
    
    #for grad students
    def multinormalPDF(self, logits, mu_i, sigma_i):  #[5pts]
        """
        Args: 
            logit: N x D numpy array
            mu_i: 1xD numpy array (or array of length D), the center for the ith gaussian.
            sigma_i: 1xDxD 3-D numpy array (or DxD 2-D numpy array), the covariance matrix of the ith gaussian.  
        Return:
            pdf: 1xN numpy array (or array of length N), the probability distribution of N data for the ith gaussian
         
        Hint: 
            np.linalg.det() and np.linalg.inv() should be handy.
        """
        N, D = logits.shape[0], logits.shape[1]
        #centered data, N x D
        centered = logits - mu_i[None, :]
        #term 1: (X - mu) Sigma^{-1}, transposed to D x N
        #a very small value is added to sigma to avoid a singular matrix error
        t1 = np.dot(centered, np.linalg.inv(sigma_i + 0.00001)).T
        #term 2: (X - mu)^T, D x N
        t2 = centered.T
        #element-wise multiplication, sum over axis 0, then exponentiate
        t3 = np.exp(np.sum(np.multiply(t1, t2), axis=0) * (-0.5))
        #term 4: determinant factor
        t4 = np.power(np.linalg.det(sigma_i), -0.5)
        #term 5: (2*pi)^(-D/2)
        t5 = np.power(np.power(2 * np.pi, D / 2), -1)
        #putting it all together: pdf is an array of length N
        pdf = t5 * t4 * t3
        return pdf        
#         raise NotImplementedError
    
    
    def _init_components(self, **kwargs): # 
        """
        Args:
            kwargs: any other arguments you want
        Return:
            pi: numpy array of length K, prior
            mu: KxD numpy array, the center for each gaussian. 
            sigma: KxDxD numpy array, the diagonal standard deviation of each gaussian. 
        """
        N, D, K = self.N, self.D, self.K
        points = self.points
        
        #initialize pi as same probability for each class
        pi = np.array([1/K] * K)
        
        #randomly select K observations as mean vectors
        indices = np.random.choice(points.shape[0], size=K, replace=False)
        # form the numpy array with these indices
        mu = points[indices, :]
        
        #sigma is np.eye for each K
        sigma = np.array([np.eye(D)]*K)
        
        return pi, mu, sigma
#         raise NotImplementedError

    
    def _ll_joint(self, pi, mu, sigma, **kwargs): # 
        """
        Args:
            pi: np array of length K, the prior of each component
            mu: KxD numpy array, the center for each gaussian. 
            sigma: KxDxD numpy array, the diagonal standard deviation of each gaussian. You will have KxDxD numpy
            array for full covariance matrix case
            
        Return:
            ll(log-likelihood): NxK array, where ll(i, k) = log pi(k) + log NormalPDF(points_i | mu[k], sigma[k])
        """
        N, D, K = self.N, self.D, self.K
        logits = self.points
        # initialize ll
        ll = np.zeros((N, K))

        # term1
        t1 = np.log(pi + 1e-32)
        # add t1 to ll
        ll = ll + t1

        # term2
        for i in range(K):
            mu_i = mu[i, :]
            sigma_i = sigma[i, :, :]
            pdf = self.multinormalPDF(logits, mu_i, sigma_i)
            pdf = np.nan_to_num(pdf)
            ll[:, i] += np.log(pdf + 1e-32)
            
#         np.nan_to_num(ll)

        return ll

    def _E_step(self, pi, mu, sigma, **kwargs): #
        """
        Args:
            pi: np array of length K, the prior of each component
            mu: KxD numpy array, the center for each gaussian. 
            sigma: KxDxD numpy array, the diagonal standard deviation of each gaussian.You will have KxDxD numpy
            array for full covariance matrix case
        Return:
            gamma(tau): NxK array, the posterior distribution (a.k.a, the soft cluster assignment) for each observation.
            
        Hint: 
            You should be able to do this with just a few lines of code by using _ll_joint() and softmax() defined above. 
        """
        logits = self.points
        N, D, K = self.N, self.D, self.K
        
        #initialize tau
        tau = np.zeros((N, K))
        
        for i in range(K):
            mu_i = mu[i, :]
            sigma_i = sigma[i, :, :]
            pdf = self.multinormalPDF(logits, mu_i, sigma_i)
            if np.isnan(pdf).any():
                print('pdf has 0')
            tau[:, i] = np.dot(pi[i], pdf).T
            
        #term2, summation
        t2 = np.sum(tau, axis=1)
        tau = tau/t2[:, None]
        
        return tau       
#         raise NotImplementedError

    def _M_step(self, gamma, **kwargs): # 
        """
        Args:
            gamma(tau): NxK array, the posterior distribution (a.k.a, the soft cluster assignment) for each observation.
        Return:
            pi: np array of length K, the prior of each component
            mu: KxD numpy array, the center for each gaussian. 
            sigma: KxDxD numpy array, the diagonal standard deviation of each gaussian. You will have KxDxD numpy
            array for full covariance matrix case
            
        Hint:  
            There are formulas in the slide and in the above description box.
        """
        N_k = np.sum(gamma, axis=0)
        logits = self.points
        N, D, K = self.N, self.D, self.K
        
        #mu_k
        mu_new = np.dot(gamma.T, logits)/ N_k.reshape(-1, 1)
            
        #sigma_k
        sigma_new = np.zeros((K, D, D))
        for i in range(K):
            #term1
            t1 = (logits - mu_new[i]).T
            #term2, element-wise multiplication
            t2 = gamma[:, i] * t1
#             t2 = np.multiply(gamma[:, i], t1)
            #term3, dot product
            t3 = np.dot(t2, (logits - mu_new[i]))
            sigma_new[i] = t3/N_k[i]           
        
        #pi_k
        pi_new = N_k / N
        
        return pi_new, mu_new, sigma_new        
#         raise NotImplementedError
    
    
    def __call__(self, abs_tol=1e-16, rel_tol=1e-16, **kwargs): # No need to change
        """
        Args:
            abs_tol: convergence criteria w.r.t absolute change of loss
            rel_tol: convergence criteria w.r.t relative change of loss
            kwargs: any additional arguments you want
        
        Return:
            gamma(tau): NxK array, the posterior distribution (a.k.a, the soft cluster assignment) for each observation.
            (pi, mu, sigma): (1xK np array, KxD numpy array, KxDxD numpy array)       
        
        Hint: 
            You do not need to change it. For each iteration, we process E and M steps, then update the paramters. 
        """
        pi, mu, sigma = self._init_components(**kwargs)
        if np.isnan(pi).any():
            print('pi has 0')
        if np.isnan(mu).any():
            print('mu has 0')
        if np.isnan(sigma).any():
            print('sigma has 0')
        pbar = tqdm(range(self.max_iters))
        
        for it in pbar:
            # E-step
            gamma = self._E_step(pi, mu, sigma)
            gamma = np.nan_to_num(gamma)
            if np.isnan(gamma).any():
                print('gamma has 0')
            
            # M-step
            pi, mu, sigma = self._M_step(gamma)
            pi = np.nan_to_num(pi)
            mu = np.nan_to_num(mu)
            sigma = np.nan_to_num(sigma)
            if np.isnan(pi).any():
                print('pi_new has 0')
            if np.isnan(mu).any():
                print('mu_new has 0')
            if np.isnan(sigma).any():
                print('sigma_new has 0')
            
            # calculate the negative log-likelihood of observation
            joint_ll = self._ll_joint(pi, mu, sigma)
            joint_ll = np.nan_to_num(joint_ll)
            if np.isnan(joint_ll).any():
                print('joint_ll has 0')
            loss = -np.sum(self.logsumexp(joint_ll))
            if it:
                diff = np.abs(prev_loss - loss)
                if diff < abs_tol and diff / prev_loss < rel_tol:
                    break
            prev_loss = loss
            pbar.set_description('iter %d, loss: %.4f' % (it, loss))
        return gamma, (pi, mu, sigma)

 
3.3 Japanese art and pixel clustering [10pts + 5pts]
Ukiyo-e is a Japanese art genre predominant from the 17th through 19th centuries. In order to produce the intricate prints that came to represent the genre, artists carved wood blocks with the patterns for each color in a design. Paint would be applied to the block and later transferred to the print to form the image. In this section, you will use your GMM implementation to do pixel clustering and estimate how many wood blocks were likely used to produce a single print; that is to say, how many wood blocks would appropriately reproduce the original painting. (Hint: you can justify your answer based on visual inspection of the resulting images or on a different metric of your choosing)

You do NOT need to submit your code for this question to the autograder. Instead you should include whatever images/information you find relevant in the report.
In [14]:
# helper function for performing pixel clustering. You don't have to modify it
def cluster_pixels_gmm(image, K):
    """Clusters pixels in the input image
    
    Args:
        image: input image of shape(H, W, 3)
        K: number of components
    Return:
        clustered_img: image of shape(H, W, 3) after pixel clustering
    """
    im_height, im_width, im_channel = image.shape
    flat_img = np.reshape(image, [-1, im_channel]).astype(np.float32)
    gamma, (pi, mu, sigma) = GMM(flat_img, K = K, max_iters = 100)()
    cluster_ids = np.argmax(gamma, axis=1)
    centers = mu

    gmm_img = np.reshape(centers[cluster_ids], (im_height, im_width, im_channel))
    
    return gmm_img

# helper function for plotting images. You don't have to modify it
def plot_images(img_list, title_list, figsize=(20, 10)):
    assert len(img_list) == len(title_list)
    fig, axes = plt.subplots(1, len(title_list), figsize=figsize)
    for i, ax in enumerate(axes):
        ax.imshow(img_list[i] / 255.0)
        ax.set_title(title_list[i])
        ax.axis('off')

In [15]:
# pick 2 of the images in this list:
url0 = 'https://upload.wikimedia.org/wikipedia/commons/b/b1/Utagawa_Kunisada_I_%28c._1832%29_Dawn_at_Futami-ga-ura.jpg'
url1 = 'https://upload.wikimedia.org/wikipedia/commons/9/95/Hokusai_%281828%29_Cuckoo_and_Azaleas.jpg'
url2 = 'https://upload.wikimedia.org/wikipedia/commons/7/74/Kitao_Shigemasa_%281777%29_Geisha_and_a_servant_carrying_her_shamisen_box.jpg'
url3 = 'https://upload.wikimedia.org/wikipedia/commons/1/10/Kuniyoshi_Utagawa%2C_Suikoden_Series_4.jpg'

# example of loading image from url0
image0 = imageio.imread(imageio.core.urlopen(url1).read())
image1 = imageio.imread(imageio.core.urlopen(url3).read())


# this is for you to implement
def find_n_woodblocks(image, min_clusters=5, max_clusters=15):
    """
    Use the helper function above to find the optimal number of woodblocks that can appropriately reproduce a single image.
    You can simply justify your answer based on visual inspection (i.e. looking at the resulting images) or provide any metric you prefer. 
    
    Args:
        image: input image of shape(H, W, 3)
        min_clusters, max_clusters: the minimum and maximum number of clusters you should test with. Defaults are 5 and 15.
        (Usually the maximum number of clusters would not exceed 15)
        
    Return:
        plot: comparison between original image and image pixel clustering.
        optional: any other information/metric/plot you think is necessary.
    """
    img_list = [image]
    title_list = ['original image']
    for num_clusters in range(min_clusters, max_clusters+1):
        np.seterr(divide='ignore', invalid='ignore')
        gmm_img = cluster_pixels_gmm(image, num_clusters)
        img_list.append(gmm_img)
        title_list.append('GMM #clusters = ' + str(num_clusters))
        
    plot_images(img_list, title_list)   
    
find_n_woodblocks(image0, 5, 8)
find_n_woodblocks(image1, 5, 8)
#     raise NotImplementedError

 
iter 99, loss: 8833709.0903: 100%|██████████| 100/100 [01:35<00:00,  1.04it/s]
iter 99, loss: 8824229.8151: 100%|██████████| 100/100 [02:00<00:00,  1.20s/it]
iter 99, loss: 8755945.0012: 100%|██████████| 100/100 [02:30<00:00,  1.51s/it]
iter 99, loss: 8780598.6015: 100%|██████████| 100/100 [02:40<00:00,  1.60s/it]
iter 99, loss: 9686920.7530: 100%|██████████| 100/100 [01:35<00:00,  1.05it/s]
iter 99, loss: 9593794.9253: 100%|██████████| 100/100 [02:00<00:00,  1.21s/it]
iter 99, loss: 9608781.9818: 100%|██████████| 100/100 [02:47<00:00,  1.67s/it]
iter 99, loss: 9567262.3851: 100%|██████████| 100/100 [02:55<00:00,  1.76s/it]

 
 
 
 
 
For my experiments, I used images 2 and 4 from the list and ran the GMM clustering for various numbers of clusters. As we can see from the images above, using 8 clusters almost reproduces the original image. Hence, I estimate that the number of woodblocks used to produce each print was 8.
 
(Bonus for All) [5 pts]
Compare the full covariance matrix with the diagonal covariance matrix in GMM. Can you explain why the images are different with the same number of clusters? Note: You will have to implement both multinormalPDF and normalPDF, and add a few arguments in the original _ll_joint() and _M_step() functions to indicate which matrix you are using. You will earn full credit only if you implement both functions AND explain the reason.
In [ ]:
def compare_matrix(image, K):
    """
    Args:
        image: input image of shape(H, W, 3)
        K: number of components
        
    Return:
        plot: comparison between full covariance matrix and diagonal covariance matrix.
    """
    raise NotImplementedError
    

In [ ]:
compare_matrix(image1, 5)

 
4. (Bonus for Grad and Undergrad) A Wrench in the Machine [30pts]
Learning to work with messy data is a hallmark of a well-rounded data scientist. In most real-world settings the data given will usually have some issue, so it is important to learn skills to work around such impasses. This part of the assignment looks to expose you to clever ways to fix data using concepts that you have already learned in the prior questions.

The two solutions covered:

KNN Algorithm Approach
EM Algorithm Approach



Question
You are a consultant assigned to a company which refines raw materials. To refine the raw materials necessary for their operations, the company owns a vast fleet of machines. Stressing the importance of having minimum down time for refining, you have been tasked to find a way to predict whether a machine will need to be repaired or not. In order to aid you on the task, the company has supplied you with historical telemetric data from all of the machines. The features range from averages of temperature, frequencies, and other salient observations of the units. The specifics of the features are not pertinent to the classification; it can be assured that each feature is statistically significant. A unit is given a 1 if it is broken and a 0 otherwise.

However, due to a software bug in logging the telemetric data, 20% of the entries are missing labels and 30% are missing characterization data. Since simply removing the corrupted entries would not reflect the true variance of the data, your job is to implement a solution to clean the data so it can be properly classified.

Your job is to assist the company in cleaning their data and implementing a semi-supervised learning framework to help them create a general classifier.

You are given two files for this task:

telemetry_data.csv: the entire dataset with complete and incomplete data
validation_data.csv: a smaller, fully complete dataset made after the software bug had been fixed
 
4.1.a Data Cleaning
The first step is to break up the whole dataset into clear parts. All the data is randomly shuffled in one csv file. In order to move forward, the data needs to be split into three separate arrays:

labeled_complete: containing the complete characterization data and corresponding labels (broken = 1 and OK = 0)
labeled_incomplete: containing partial characterization data and corresponding labels (broken = 1 and OK = 0)
unlabeled_complete: containing only complete material characterization results
In [ ]:
def complete_(data):
    """
    Args:
        data: N x D numpy array    
    Return:
        labeled_complete: n x D array where values contain both complete features and labels
    """
    raise NotImplementedError
    
def incomplete_(data):
    """
    Args:
        data: N x D numpy array    
    Return:
        labeled_incomplete: n x D array where values contain incomplete features but complete labels
    """
    raise NotImplementedError

def unlabeled_(data):
    """
    Args:
        data: N x D numpy array    
    Return:
        unlabeled_complete: n x D array where values contain complete features but incomplete labels
    """
    raise NotImplementedError

 
4.1.b KNN [10pts]
The second step in this task is to clean the labeled_incomplete dataset by filling in the missing values with probable ones derived from the complete data. A useful approach to this type of problem is a k-nearest neighbors (k-NN) algorithm. For this application, the method consists of replacing each missing value of a given point with the mean of the closest k neighbors to that point.
In [ ]:
class CleanData(object):
    def __init__(self): # No need to implement
        pass
    
    def pairwise_dist(self, x, y): # [0pts] - copy from kmeans
        """
        Args:
            x: N x D numpy array
            y: M x D numpy array
        Return:
            dist: N x M array, where dist[i, j] is the euclidean distance between 
            x[i, :] and y[j, :]
        """
        #l2 norm, ord=2
        dist = np.linalg.norm(x[:, np.newaxis, :] - y, ord=2, axis=2)
        return dist
#         raise NotImplementedError
    
    def __call__(self, incomplete_points,  complete_points, K, **kwargs): # [10pts]
        """
        Args:
            incomplete_points: N_incomplete x (D+1) numpy array, the incomplete labeled observations
            complete_points: N_complete x (D+1) numpy array, the complete labeled observations
            K: integer, corresponding to the number of nearest neighbors you want to base your calculation on
            kwargs: any other args you want
        Return:
            clean_points: (N_incomplete + N_complete) x (D+1) numpy array, containing both the complete points and the newly filled points
            
        Hints: (1) You want to find the k-nearest neighbors within each class separately;
               (2) There are missing values in all of the features. It might be more convenient to address each feature at a time.
        """
        raise NotImplementedError            
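
As a rough illustration of the hints above (not the graded CleanData.__call__), the sketch below imputes each missing feature from the K nearest complete points of the same class, measuring distance over the features that are actually observed for that point. It assumes missing values are np.nan and the label is in the last column, and its row order may differ from the grader's.

import numpy as np

def knn_impute_sketch(incomplete_points, complete_points, K):
    # hypothetical helper; assumes np.nan marks missing features and the last column is the label
    filled = incomplete_points.copy()
    for i, row in enumerate(filled):
        label = row[-1]
        missing = np.where(np.isnan(row[:-1]))[0]
        observed = np.where(~np.isnan(row[:-1]))[0]
        # hint (1): only use complete points of the same class
        same_class = complete_points[complete_points[:, -1] == label]
        # distances computed over the observed features only
        dists = np.linalg.norm(same_class[:, observed] - row[observed], axis=1)
        neighbors = same_class[np.argsort(dists)[:K]]
        # hint (2): fill each missing feature with its mean over the K neighbors
        filled[i, missing] = neighbors[:, missing].mean(axis=0)
    # return the complete points stacked with the newly filled points
    return np.concatenate([complete_points, filled], axis=0)

On the toy dataset below, this sketch reproduces the filled values shown in the expected answer (up to the ordering of the rows).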

 
Below is what the process should produce on a toy dataset. If your output matches the answer below, you are on the right track.
In [ ]:
complete_data = np.array([[1.,2.,3.,1],[7.,8.,9.,0],[16.,17.,18.,1],[22.,23.,24.,0]])
incomplete_data = np.array([[1.,np.nan,3.,1],[7.,np.nan,9.,0],[np.nan,17.,18.,1],[np.nan,23.,24.,0]])

clean_data = CleanData()(incomplete_data, complete_data, 2)
print("*** Expected Answer - k = 2 ***")
print("""==complete data==
[[ 1.  2.  3.  1.]
 [ 7.  8.  9.  0.]
 [16. 17. 18.  1.]
 [22. 23. 24.  0.]]
==incomplete data==
[[ 1. nan  3.  1.]
 [ 7. nan  9.  0.]
 [nan 17. 18.  1.]
 [nan 23. 24.  0.]]
==clean_data==
[[ 1.   2.   3.   1. ]
 [ 7.   8.   9.   0. ]
 [16.  17.  18.   1. ]
 [22.  23.  24.   0. ]
 [14.5 23.  24.   0. ]
 [ 7.  15.5  9.   0. ]
 [ 8.5 17.  18.   1. ]
 [ 1.   9.5  3.   1. ]]""")

print("\n*** My Answer - k = 2***")
print(clean_data)
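
To see where the filled values come from: the incomplete row [1, nan, 3, 1] has label 1, so its two nearest label-1 complete points are [1, 2, 3] and [16, 17, 18], and the missing second feature is imputed as (2 + 17) / 2 = 9.5; likewise, [nan, 23, 24, 0] gets (7 + 22) / 2 = 14.5 for its first feature.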

 
4.2 Getting acquainted with semi-supervised learning approaches. [5pts]
You will implement a version of the algorithm presented in Table 1 of the paper "Text Classification from Labeled and Unlabeled Documents using EM" by Nigam et al. (2000). While you are encouraged to read the whole paper, this assignment focuses on Sections 1-5.2 and 6.1. Write a brief summary of three interesting highlights of the paper (50-word maximum).
 
4.3 Implementing the EM algorithm. [10 pts]
In your implementation of the EM algorithm proposed by Nigam et al. (2000) in Table 1, you will use a Gaussian Naive Bayes (GNB) classifier in place of the naive Bayes (NB) classifier used in the paper. (Hint: Using a GNB instead of an NB lets you reuse most of the implementation you developed for GMM in this assignment. In fact, you can solve the problem by modifying only the __call__ method.)
In [ ]:
class SemiSupervised(object):
    def __init__(self): # No need to implement
        pass
    
    def softmax(self,logits): # [0 pts] - can use same as for GMM
        """
        Args:
        logits: N x D numpy array
        """
        raise NotImplementedError

    def logsumexp(self,logits): # [0 pts] - can use same as for GMM
        """
        Args:
            logits: N x D numpy array
        Return:
            s: N x 1 array where s[i,0] = logsumexp(logits[i,:])
        """
        raise NotImplementedError
    
    def _init_components(self, points, K, **kwargs): # [5 pts] - modify from GMM
        """
        Args:
            points: Nx(D+1) numpy array, the observations
            K: number of components
            kwargs: any other args you want
        Return:
            pi: numpy array of length K, prior
            mu: KxD numpy array, the center for each gaussian. 
            sigma: KxDxD numpy array, the diagonal standard deviation of each gaussian.
            
        Hint: The paper describes how you should initialize your algorithm.
        """
        raise NotImplementedError

    def _ll_joint(self, points, pi, mu, sigma, **kwargs): # [0 pts] - can use same as for GMM
        """
        Args:
            points: NxD numpy array, the observations
            pi: np array of length K, the prior of each component
            mu: KxD numpy array, the center for each gaussian. 
            sigma: KxDxD numpy array, the diagonal standard deviation of each gaussian.
        Return:
            ll(log-likelihood): NxK array, where ll(i, j) = log pi(j) + log NormalPDF(points_i | mu[j], sigma[j])
            
        Hint: Assume that the features of each observation (multivariate gaussian) are independent.
              This allows you to treat the joint density as a product of univariate gaussians.
        """
        raise NotImplementedError

    def _E_step(self, points, pi, mu, sigma, **kwargs): # [0 pts] - can use same as for GMM
        """
        Args:
            points: NxD numpy array, the observations
            pi: np array of length K, the prior of each component
            mu: KxD numpy array, the center for each gaussian. 
            sigma: KxDxD numpy array, the diagonal standard deviation of each gaussian.
        Return:
            gamma: NxK array, the posterior distribution (a.k.a, the soft cluster assignment) for each observation.
            
        Hint: You should be able to do this with just a few lines of code by using _ll_joint() and softmax() defined above. 
        """
        raise NotImplementedError

    def _M_step(self, points, gamma, **kwargs): # [0 pts] - can use same as for GMM
        """
        Args:
            points: NxD numpy array, the observations
            gamma: NxK array, the posterior distribution (a.k.a, the soft cluster assignment) for each observation.
        Return:
            pi: np array of length K, the prior of each component
            mu: KxD numpy array, the center for each gaussian. 
            sigma: KxDxD numpy array, the diagonal standard deviation of each gaussian. 
            
        Hint:  There are formulas in the slide.
        """
        raise NotImplementedError

    def __call__(self, points, K, max_iters=100, abs_tol=1e-16, rel_tol=1e-16, **kwargs): # [5 pts] - modify from GMM
        """
        Args:
            points: NxD numpy array, where N is # points and D is the dimensionality
            K: number of clusters
            max_iters: maximum number of iterations
            abs_tol: convergence criteria w.r.t absolute change of loss
            rel_tol: convergence criteria w.r.t relative change of loss
            kwargs: any additional arguments you want
        Return:
            gamma: NxK array, the posterior distribution (a.k.a, the soft cluster assignment) for each observation.
            (pi, mu, sigma): (1xK numpy array, KxD numpy array, KxDxD numpy array), the priors, centers, and diagonal covariances of the fitted gaussians.
         
        """
        raise NotImplementedError
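
To make the overall control flow concrete, here is a self-contained sketch of the semi-supervised EM loop. It is not the graded SemiSupervised class: it assumes the last column of points holds the class label for labeled rows and -1 for unlabeled rows, that K equals the number of classes, and that every class has at least one labeled row, and it stores sigma as a KxD array of per-feature standard deviations rather than the KxDxD convention used in the stubs above.

import numpy as np

def semi_supervised_gnb_sketch(points, K, max_iters=100, tol=1e-6):
    # Hypothetical helper illustrating the Nigam et al. (2000) Table 1 loop with a GNB.
    X, y = points[:, :-1], points[:, -1]
    labeled = y >= 0
    N, D = X.shape

    # Step 1 of Table 1: build the initial classifier from the labeled rows only.
    pi = np.array([(y[labeled] == k).mean() for k in range(K)])
    mu = np.array([X[labeled & (y == k)].mean(axis=0) for k in range(K)])
    sigma = np.array([X[labeled & (y == k)].std(axis=0) + 1e-6 for k in range(K)])

    prev_ll = -np.inf
    for _ in range(max_iters):
        # E-step: log pi_k + log N(x | mu_k, diag(sigma_k^2)), then a row-wise softmax.
        log_p = np.zeros((N, K))
        for k in range(K):
            log_p[:, k] = (np.log(pi[k] + 1e-32)
                           - 0.5 * np.sum(np.log(2 * np.pi * sigma[k] ** 2))
                           - 0.5 * np.sum(((X - mu[k]) / sigma[k]) ** 2, axis=1))
        m = log_p.max(axis=1, keepdims=True)
        ll = np.sum(m[:, 0] + np.log(np.exp(log_p - m).sum(axis=1)))   # logsumexp
        gamma = np.exp(log_p - m)
        gamma /= gamma.sum(axis=1, keepdims=True)
        # Labeled rows keep their known labels (hard assignment), as in the paper.
        gamma[labeled] = np.eye(K)[y[labeled].astype(int)]

        # M-step: re-estimate pi, mu, sigma from the soft counts.
        Nk = gamma.sum(axis=0)
        pi = Nk / N
        mu = (gamma.T @ X) / Nk[:, None]
        var = (gamma.T @ (X ** 2)) / Nk[:, None] - mu ** 2
        sigma = np.sqrt(np.maximum(var, 1e-12))

        if abs(ll - prev_ll) < tol:   # simple absolute-change convergence check
            break
        prev_ll = ll

    return gamma, (pi, mu, sigma)

A new point can then be classified by running one E-step under the returned (pi, mu, sigma) and taking the argmax over components, which is what the comparison code further below does with _E_step.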

 
4.4 Demonstrating the performance of the algorithm. [5pts]
Compare the classification error of the GNB-based classifier you implemented following the Nigam et al. (2000) approach against that of a GNB classifier trained using only the labeled data. Since supervised learning has not yet been covered in class, you may use the scikit-learn library for training the GNB classifier on the labeled data only: https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html.
In [ ]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

class ComparePerformance(object):
    
    def __init__(self): #No need to implement
        pass
    
    
    def accuracy_semi_supervised(self, points, independent, n=8):
        """
        Args:
            points: Nx(D+1) numpy array, where N is the number of points in the training set, D is the dimensionality, the last column
            represents the labels (when available) or a flag that allows you to separate the unlabeled data.
            independent: Nx(D+1) numpy array, where N is # points and D is the dimensionality and the last column are the correct labels
        Return:
            accuracy: floating number
        """
        raise NotImplementedError

    def accuracy_GNB_onlycomplete(self, points, independent, n=8):
        """
        Args:
            points: Nx(D+1) numpy array, where N is the number of only initially complete labeled points in the training set, D is the dimensionality, the last column
            represents the labels.
            independent: Nx(D+1) numpy array, where N is # points and D is the dimensionality and the last column are the correct labels
        Return:
            accuracy: floating number
        """
        raise NotImplementedError

    def accuracy_GNB_cleandata(self, points, independent, n=8):
        """
        Args:
            points: Nx(D+1) numpy array, where N is the number of clean labeled points in the training set, D is the dimensionality, the last column
            represents the labels.
            independent: Nx(D+1) numpy array, where N is # points and D is the dimensionality and the last column are the correct labels
        Return:
            accuracy: floating number
        """
        raise NotImplementedError
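
For the supervised baselines, one plausible pattern (a sketch, not the graded ComparePerformance methods) is to fit scikit-learn's GaussianNB on the training rows and score it on the validation set, assuming the last column of both arrays holds the labels:

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

def gnb_accuracy_sketch(train_points, test_points):
    # hypothetical helper; assumes the last column of both arrays is the label
    X_train, y_train = train_points[:, :-1], train_points[:, -1]
    X_test, y_test = test_points[:, :-1], test_points[:, -1]
    model = GaussianNB().fit(X_train, y_train)
    return accuracy_score(y_test, model.predict(X_test))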

In [ ]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
# Load and clean data for the next section
telemetry = np.loadtxt('data/telemetry.csv', delimiter=',')

labeled_complete = complete_(telemetry)
labeled_incomplete = incomplete_(telemetry)
unlabeled = unlabeled_(telemetry)

clean_data = CleanData()(labeled_incomplete, labeled_complete, 7)
# replace the missing label column of the unlabeled set with a -1 "unlabeled" flag
unlabeled = np.delete(unlabeled, -1, axis=1)
unlabeled_flag = -1*np.ones((unlabeled.shape[0],1))
unlabeled = np.concatenate((unlabeled, unlabeled_flag), 1)

# =========================================================================
# SEMI SUPERVISED

# format training data
points = np.concatenate((clean_data, unlabeled),0)

# train model; __call__ returns the responsibilities and the fitted parameters
gamma, (pi, mu, sigma) = SemiSupervised()(points, 7)

# ==========================================================================
# COMPARISON

# load test data
independent = np.loadtxt('data/validation.csv', delimiter=',')

# classify test data
classification = SemiSupervised()._E_step(independent[:,:8], pi, mu, sigma)
classification = np.argmax(classification,axis=1)

# =========================================================================================

print("""===COMPARISON===""")
print("""SemiSupervised Accuracy:""", ComparePerformance().accuracy_semi_supervised(classification, independent))
print("""Supervised with clean data: GNB Accuracy:""", ComparePerformance().accuracy_GNB_onlycomplete(labeled_complete, independent))
print("""Supervised with only complete data: GNB Accuracy:""", ComparePerformance().accuracy_GNB_cleandata(clean_data, independent))
