CS7641 Homework 3: Image compression with SVD, PCA, Polynomial Regression and Regularization (Solved)

1 Image compression with SVD **[P]** **[W]**
 
Load images data and plot
In [2]:
# HELPER CELL, DO NOT MODIFY
# load Image
image = plt.imread("hw3_image_2.jpg")/255.
#plot image
fig = plt.figure(figsize=(10,10))
plt.imshow(image)

Out[2]:
<matplotlib.image.AxesImage at 0x7fd7f813eef0>
 
 
In [3]:
# HELPER CELL, DO NOT MODIFY
def rgb2gray(rgb):   
    return np.dot(rgb[...,:3], [0.299, 0.587, 0.114])

fig = plt.figure(figsize=(10, 10))
# plot several images
plt.imshow(rgb2gray(image), cmap=plt.cm.bone)

Out[3]:
<matplotlib.image.AxesImage at 0x7fd7f8c6b0f0>
 
 
 
1.1 Image compression  **[P]**
SVD is a dimensionality reduction technique that allows us to compress images by throwing away the least important information.

Larger singular values capture more of the variance, and thus more of the information, carried by the corresponding singular vectors. To perform image compression, apply SVD to each matrix and discard the small singular values. The loss of information through this process is negligible, and the difference between the images can hardly be spotted. For example, the proportion of variance captured by the first component is

$$\frac{\sigma_1}{\sum_{i=1}^{n} \sigma_i}$$

where $\sigma_i$ is the $i$-th singular value. You need to finish the following functions to perform SVD and then reconstruct the image from its components.

Hint 1: http://timbaumann.info/svd-image-compression-demo/ is a useful article on image compression.
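As a quick, hedged illustration of the idea (not part of the required ImgCompression implementation), the sketch below truncates the SVD of a small grayscale matrix to k components and reports the share of variance attributed to the first singular value using the formula above; `gray` and `k` are placeholder values.

```python
import numpy as np

# Minimal sketch of rank-k truncation on a grayscale matrix `gray` (assumed N x D).
gray = np.random.rand(64, 48)   # placeholder image matrix
k = 10                          # placeholder number of components

U, S, Vt = np.linalg.svd(gray, full_matrices=False)   # S holds singular values, largest first
approx = U[:, :k] @ np.diag(S[:k]) @ Vt[:k, :]        # rank-k reconstruction

# Proportion of variance attributed to the first component, as in the formula above
first_component_share = S[0] / S.sum()
print(approx.shape, first_component_share)
```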
In [4]:
class ImgCompression(object):
    def __init__(self):
        pass

    def svd(self, X): # [5pts]
        """
        Do SVD. You could use numpy SVD.
        Your function should be able to handle black and white
        images (N*D arrays) as well as color images (N*D*3 arrays)
        In image compression, we assume that each column of the image is a feature. The image is the matrix X.
        Args:
            X: N * D array corresponding to an image (N * D * 3 if color image)
        Return:
            U: N * N for black and white images / N * N * 3 for color images
            S: min(N, D) * 1 for black and white images / min(N, D) * 3 for color images
            V: D * D for black and white images / D * D * 3 for color images
        """
        # BnW image
        if len(X.shape) == 2:
            U, S, V = np.linalg.svd(X)
        else:
            # colour image
            N, D = X.shape[0], X.shape[1]
            min_ND = min(N, D)

            # channel0
            U0, S0, V0 = np.linalg.svd(X[:, :, 0])
            S0 = S0.reshape((min_ND, 1))

            # channel1
            U1, S1, V1 = np.linalg.svd(X[:, :, 1])
            S1 = S1.reshape((min_ND, 1))

            # channel2
            U2, S2, V2 = np.linalg.svd(X[:, :, 2])
            S2 = S2.reshape((min_ND, 1))

            #combining
            U = np.array([U0, U1, U2])
            U = U.transpose(1, 2, 0)

            S = np.concatenate((S0, S1, S2), axis=1)

            V = np.array([V0, V1, V2])
            V = V.transpose(1, 2, 0)
        return U, S, V
        
#         raise NotImplementedError


    def rebuild_svd(self, U, S, V, k): # [5pts]
        """
        Rebuild the image from the top-k SVD components.
        Args:
            U: N*N (*3 for color images)
            S: min(N, D)*1 (*3 for color images)
            V: D*D (*3 for color images)
            k: int corresponding to number of components
        Return:
            Xrebuild: N*D array of reconstructed image (N*D*3 if color image)

        Hint: numpy.matmul may be helpful for reconstructing color images
        """
        if len(U.shape) == 2:
            # BnW image
            # term1
            t1 = np.matmul(U[:, :k], np.diag(S)[:k, :k])
            # term2
            Xrebuild = np.matmul(t1, V[:k, :])
        else:
            # colour image
            N = U.shape[0]
            D = V.shape[0]

            # channel 0
            U0 = U[:, :, 0]
            S0 = S[:, 0]
            V0 = V[:, :, 0]
            # term1
            t1 = np.matmul(U0[:, :k], np.diag(S0)[:k, :k])
            # term2
            Xrebuild0 = np.matmul(t1, V0[:k, :])

            # channel 1
            U1 = U[:, :, 1]
            S1 = S[:, 1]
            V1 = V[:, :, 1]
            # term1
            t1 = np.matmul(U1[:, :k], np.diag(S1)[:k, :k])
            # term2
            Xrebuild1 = np.matmul(t1, V1[:k, :])

            # channel 2
            U2 = U[:, :, 2]
            S2 = S[:, 2]
            V2 = V[:, :, 2]
            # term1
            t1 = np.matmul(U2[:, :k], np.diag(S2)[:k, :k])
            # term2
            Xrebuild2 = np.matmul(t1, V2[:k, :])

            # combining
            Xrebuild = np.array([Xrebuild0, Xrebuild1, Xrebuild2])
            Xrebuild = Xrebuild.transpose(1, 2, 0)

        return Xrebuild   

#         raise NotImplementedError

    def compression_ratio(self, X, k): # [5pts]
        """
        Compute compression of an image: (num stored values in compressed)/(num stored values in original)
        Args:
            X: N * D array corresponding to an image (N * D * 3 if color image)
            k: int corresponding to number of components
        Return:
            compression_ratio: float of proportion of storage used by compressed image
        """
        if len(X.shape) == 2:
            # BnW image
            num_orig = X.shape[0] * X.shape[1]
            num_compress = k * (1 + X.shape[0] + X.shape[1])
        else:
            # colour image
            num_orig = X.shape[0] * X.shape[1] * X.shape[2]
            num_compress = k * (1 + X.shape[0] + X.shape[1]) * X.shape[2]
            
        compression_ratio = num_compress * 1.0 / num_orig
        return compression_ratio
        
#         raise NotImplementedError

    def recovered_variance_proportion(self, S, k): # [5pts]
        """
        Compute the proportion of the variance in the original matrix recovered by a rank-k approximation

        Args:
           S: min(N, D)*1 (*3 for color images) of singular values for the image
           k: int, rank of approximation
        Return:
           recovered_var: float (array of 3 floats for color image) corresponding to proportion of recovered variance
        """
        S2 = S ** 2
        if len(S.shape) == 1:
            # BnW image
            recovered_var = np.sum(S2[:k]) / np.sum(S2)
        else:
            # colour image
            # channel0
            recovered_var0 = np.sum(S2[:k, 0]) / np.sum(S2[:, 0])
            # channel1
            recovered_var1 = np.sum(S2[:k, 1]) / np.sum(S2[:, 1])
            # channel2
            recovered_var2 = np.sum(S2[:k, 2]) / np.sum(S2[:, 2])
            recovered_var = [recovered_var0, recovered_var1, recovered_var2]

        return recovered_var        
        
#         raise NotImplementedError

 
1.2 Black and white  **[W]**
Use your implementation to generate a set of images compressed to different degrees. Include the images in your non-programming submission to the assignment.
In [5]:
# HELPER CELL, DO NOT MODIFY
imcompression = ImgCompression()
bw_image = rgb2gray(image)
U, S, V = imcompression.svd(bw_image)
component_num = [1,2,5,10,20,40,80,160,256]

fig = plt.figure(figsize=(18, 18))

# plot several images
i=0
for k in component_num:
    img_rebuild = imcompression.rebuild_svd(U, S, V, k)
    c = np.around(imcompression.compression_ratio(bw_image, k), 4)
    r = np.around(imcompression.recovered_variance_proportion(S, k), 3)
    ax = fig.add_subplot(3, 3, i + 1, xticks=[], yticks=[])
    ax.imshow(img_rebuild, cmap=plt.cm.bone)
    ax.set_title(f"{k} Components")
    ax.set_xlabel(f"Compression: {c},\nRecovered Variance: {r}")
    i = i+1

 
 
 
1.3 Color image [5 pts] **[W]**
Use your implementation to generate a set of images compressed to different degrees. Include the images in your non-programming submission to the assignment.

Note: You might get the warning "Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers)." This warning is acceptable since, while rebuilding, some pixel values may go above 1.0. You should see an image similar to the original even with such clipping.
In [6]:
# HELPER CELL, DO NOT MODIFY
imcompression = ImgCompression()
U, S, V = imcompression.svd(image)

# component_num = [1,2,5,10,20,40,80,160,256]
component_num = [1,2,5,10,20,40,80,160,256]

fig = plt.figure(figsize=(18, 18))

# plot several images
i=0
for k in component_num:
    img_rebuild = imcompression.rebuild_svd(U, S, V, k)
    c = np.around(imcompression.compression_ratio(image, k), 4)
    r = np.around(imcompression.recovered_variance_proportion(S, k), 3)
    ax = fig.add_subplot(3, 3, i + 1, xticks=[], yticks=[])
    ax.imshow(img_rebuild)
    ax.set_title(f"{k} Components")
    ax.set_xlabel(f"Compression: {np.around(c,4)},\nRecovered Variance:  R: {r[0]}  G: {r[1]}  B: {r[2]}")
    i = i+1

 
Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers). (warning repeated 8 times)

 
 
 
2 Understanding PCA [15 pts] **[P]** | **[W]**
 
2.1 Implementation [10 pts] **[P]**
Principal Component Analysis (PCA) is another dimensionality reduction technique; it reduces dimensions by discarding the components with small variance (small eigenvalues) and their corresponding eigenvectors. With PCA, we first center the data by subtracting the mean. Each singular value tells us how much of the variance of a matrix (e.g. an image) is captured by the corresponding component. In this problem, we will investigate how PCA can be used to improve features for regression and classification tasks and how the data itself affects the behavior of PCA.

Implement PCA in the below cell.

Assume a dataset is composed of N datapoints, each of which has D features with D < N. The dimension of our data would be D. It is possible, however, that many of these dimensions contain redundant information. Each feature explains part of the variance in our dataset. Some features may explain more variance than others.

In the following cell, complete the PCA class by implementing the functions fit, transform and transform_rv.
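As a small, optional sketch of the quantities fit and transform_rv work with, the snippet below (using made-up data) centers a matrix, takes its SVD, and converts singular values into explained-variance proportions:

```python
import numpy as np

# Toy data: 100 points, 5 features (placeholder values)
X = np.random.rand(100, 5)

Xc = X - X.mean(axis=0)                          # center the data first
U, S, Vt = np.linalg.svd(Xc, full_matrices=False)

explained = S**2 / np.sum(S**2)                  # variance proportion per component
cumulative = np.cumsum(explained)                # used to pick K for a retained-variance target
print(explained, cumulative)
```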
In [7]:
class PCA(object):

    def __init__(self):
        self.U = None
        self.S = None
        self.V = None

    def fit(self, X): # 5 points
        """
        Decompose dataset into principal components.
        You may use your SVD function from the previous part in your implementation or numpy.linalg.svd function.

        Don't return anything. You can directly set self.U, self.S and self.V declared in __init__ with
        corresponding values from PCA.

        Args:
            X: N*D array corresponding to a dataset
        Return:
            None
        """
        mu = np.mean(X, axis = 0, keepdims = True)
        X = X - mu
        U, S, V = np.linalg.svd(X, full_matrices = False)
        self.U = U
        self.S = S
        self.V = V
        
#         raise NotImplementedError

    def transform(self, data, K=2): # 2 pts
        """
        Transform data to reduce the number of features such that final data has given number of columns

        Utilize self.U, self.S and self.V that were set in fit() method.

        Args:
            data: N*D array corresponding to a dataset
            K: Int value for number of columns to be kept
        Return:
            X_new: N*K array corresponding to data obtained by applying PCA on data
        """
        self.fit(data)
        X_new = np.dot(data, self.V.T[:, :K])
        return X_new
        
#         raise NotImplementedError

    def transform_rv(self, data, retained_variance=0.99): # 3 pts
        """
        Transform data to reduce the number of features such that a given variance is retained

        Utilize self.U, self.S and self.V that were set in fit() method.

        Args:
            data: N*D array corresponding to a dataset
            retained_variance: Float value for amount of variance to be retained
        Return:
            X_new: N*K array corresponding to data obtained by applying PCA on data
        """
        self.fit(data)
        var = np.cumsum(self.S ** 2)
        var = var / var[-1]

        N = data.shape[0]

        for i in range(N):
            if var[i] >= retained_variance:
                break

        X_new = np.dot(data, self.V.T[:, :i + 1])
        return X_new        

#         raise NotImplementedError

    def get_V(self):
        """ Getter function for value of V """
        
        return self.V

 
2.2 Visualize [5 pts] **[W]**
PCA is used to transform multivariate data tables into smaller sets so as to observe the hidden trends and variations in the data. Here you will visualize two datasets (iris and wine) using PCA. Use the above implementation of PCA and reduce the datasets such that they contain only two features. Make 2-D scatter plots of the data points using these features. Make sure to differentiate the data points according to their true labels. The datasets have already been loaded for you. In addition, return the retained variance obtained from the reduced features.
In [8]:
def visualize(X,y): # 5 pts
    """
    Args:
        xtrain: NxD numpy array, where N is number of instances and D is the dimensionality of each instance
        ytrain: numpy array (N,), the true labels
  
    Return:
        None
    """
    N = X.shape[0]
    
    #do dimensionality reduction
    obj = PCA()
    x_new = obj.transform(X, 2)
    
    #add y values as column to xnew    
    y_new = y.reshape((N, 1))
    x = np.hstack((x_new, y_new))
    
    #make scatter plot
    plot = plt.scatter(x[:, 0], x[:, 1], c = x[:, 2], marker = 'x')
    
#     raise NotImplementedError

In [9]:
# HELPER CELL, DO NOT MODIFY
#Use PCA for visualization of iris and wine data
iris_data = load_iris(return_X_y=True)

X = iris_data[0]
y = iris_data[1]

visualize(X, y)

 
 
In [10]:
# HELPER CELL, DO NOT MODIFY
wine_data = load_wine(return_X_y=True)

X = wine_data[0]
y = wine_data[1]

visualize(X, y)

 
 
 
Now you will use PCA on an actual real-world dataset. We will use your implementation of PCA function to reduce the dataset with 99% retained variance and use it to obtain the reduced features. On the reduced dataset, we will use logistic or linear regression and compare results between PCA and non-PCA datasets. Run the following cells to see how PCA works on regression and classification tasks.
In [11]:
# HELPER CELL, DO NOT MODIFY
#load the dataset 
iris = load_iris()

X = iris.data
y = iris.target

print("data shape before PCA ",X.shape)

pca = PCA()
pca.fit(X)

X_pca = pca.transform_rv(X)

print("data shape with PCA ",X_pca.shape)

 
data shape before PCA  (150, 4)
data shape with PCA  (150, 3)

In [12]:
# HELPER CELL, DO NOT MODIFY
# Train, test splits
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, 
                                                    stratify=y, 
                                                    random_state=42)

# Use logistic regression to predict classes for test set
clf = LogisticRegression()
clf.fit(X_train, y_train)
preds = clf.predict_proba(X_test)
print('Accuracy: {:.5f}'.format(accuracy_score(y_test, 
                                                preds.argmax(axis=1))))

 
Accuracy: 0.91111

In [13]:
# HELPER CELL, DO NOT MODIFY
# Train, test splits
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=.3, 
                                                    stratify=y, 
                                                    random_state=42)

# Use logistic regression to predict classes for test set
clf = LogisticRegression()
clf.fit(X_train, y_train)
preds = clf.predict_proba(X_test)
print('Accuracy: {:.5f}'.format(accuracy_score(y_test, 
                                                preds.argmax(axis=1))))

 
Accuracy: 0.91111

In [14]:
# HELPER CELL, DO NOT MODIFY
def apply_regression(X_train, y_train, X_test):
    ridge = Ridge()
    weight = ridge.fit(X_train, y_train)
    y_pred = ridge.predict(X_test)
    
    return y_pred

In [15]:
# HELPER CELL, DO NOT MODIFY
#load the dataset 
diabetes = load_diabetes()
X = diabetes.data
y = diabetes.target

print(X.shape, y.shape)

pca = PCA()
pca.fit(X)

X_pca = pca.transform_rv(X)
print("data shape with PCA ",X_pca.shape)

 
(442, 10) (442,)
data shape with PCA  (442, 8)

In [16]:
# HELPER CELL, DO NOT MODIFY
# Train, test splits
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=42)

#Ridge regression without PCA
y_pred = apply_regression(X_train, y_train, X_test)

# calculate RMSE 
rmse_score = np.sqrt(mean_squared_error(y_pred, y_test))
print("rmse score without PCA",rmse_score)

 
rmse score without PCA 55.79391924562032

In [17]:
# HELPER CELL, DO NOT MODIFY
#Ridge regression with PCA
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=.3, random_state=42)

#use Ridge Regression for getting predicted labels
y_pred = apply_regression(X_train,y_train,X_test)

#calculate RMSE 
rmse_score = np.sqrt(mean_squared_error(y_pred, y_test))
print("rmse score with PCA",rmse_score)

 
rmse score with PCA 55.78489213087043

 
For both tasks above, performance with the PCA-reduced features is essentially the same as with the full feature set (identical accuracy and a marginally lower RMSE), even though PCA reduced the number of features.

Feel free to add other datasets in cell below and play around with what kind of improvement you get with using PCA. There are no points for playing around with other datasets.
In [18]:
######## YOUR CODE BELOW ########

#################################

 
3 Polynomial regression and regularization [55 pts + 20 pts bonus for CS 4641] **[W]** | **[P]**
 
3.1 Regression and regularization implementations [30 pts + 20 pts bonus for CS 4641] **[P]**
We have three methods to fit linear and ridge regression models: 1) closed form; 2) gradient descent (GD); 3) stochastic gradient descent (SGD). Undergraduate students are only required to implement the closed-form solutions for linear regression and ridge regression; the other four methods are bonus. Graduate students are required to implement all of them. We use the term weight in the following code; weights and parameters ($\theta$) have the same meaning here. We used parameters ($\theta$) in the lecture slides.
In [19]:
class Regression(object):
    
    def __init__(self):
        pass
    
    def rmse(self, pred, label): # [5pts]
        '''
        This is the root mean square error.
        Args:
            pred: numpy array of length N * 1, the prediction of labels
            label: numpy array of length N * 1, the ground truth of labels
        Return:
            a float value
        '''
        rmse = np.sqrt(np.mean((pred - label) ** 2))
        return rmse
        
#         raise NotImplementedError
    
    def construct_polynomial_feats(self, x, degree): # [5pts]
        """
        Args:
            x: numpy array of length N, the 1-D observations
            degree: the max polynomial degree
        Return:
            feat: numpy array of shape Nx(degree+1), remember to include 
            the bias term. feat is in the format of:
            [[1.0, x1, x1^2, x1^3, ....,],
             [1.0, x2, x2^2, x2^3, ....,],
             ......
            ]
        """
        N = x.shape[0]
        feat = np.ones((N))
        for i in range(1, degree + 1):
            x_term = np.power(x, i)
            temp = np.column_stack((feat, x_term))
            feat = temp
        return feat
        
#         raise NotImplementedError


    def predict(self, xtest, weight): # [5pts]
        """
        Args:
            xtest: NxD numpy array, where N is number 
                   of instances and D is the dimensionality of each 
                   instance
            weight: Dx1 numpy array, the weights of linear regression model
        Return:
            prediction: Nx1 numpy array, the predicted labels
        """
        prediction = np.dot(xtest, weight)
        return prediction
        
#         raise NotImplementedError
        
    # =================
    # LINEAR REGRESSION
    # Hints: in the fit function, use the closed-form solution of linear regression to get the weights.
    # For the inverse, you can use numpy's linear algebra functions.
    # For predict, use a linear combination of the data points and their weights (y = theta0*1 + theta1*X1 + ...)

    def linear_fit_closed(self, xtrain, ytrain): # [5pts]
        """
        Args:
            xtrain: N x D numpy array, where N is number of instances and D is the dimensionality of each instance
            ytrain: N x 1 numpy array, the true labels
        Return:
            weight: Dx1 numpy array, the weights of linear regression model
        """
        # term1
        t1 = np.linalg.pinv(np.dot(xtrain.T, xtrain))
        # term2
        t2 = np.dot(t1, xtrain.T)
        weight = np.dot(t2, ytrain)
        return weight
        
#         raise NotImplementedError

    def linear_fit_GD(self, xtrain, ytrain, epochs=5, learning_rate=0.001): # [5pts]
        """
        Args:
            xtrain: NxD numpy array, where N is number 
                    of instances and D is the dimensionality of each 
                    instance
            ytrain: Nx1 numpy array, the true labels
        Return:
            weight: Dx1 numpy array, the weights of linear regression model
        """
        N, D = xtrain.shape[0], xtrain.shape[1]
        weight = np.zeros((D, 1))
        for i in range(epochs):
            temp = weight + learning_rate * (np.dot(xtrain.T, (ytrain - np.dot(xtrain, weight)))) / N
            weight = temp
        return weight
        
#         raise NotImplementedError

    def linear_fit_SGD(self, xtrain, ytrain, epochs=100, learning_rate=0.001): # [5pts]
        """
        Args:
            xtrain: NxD numpy array, where N is number 
                    of instances and D is the dimensionality of each 
                    instance
            ytrain: Nx1 numpy array, the true labels
        Return:
            weight: Dx1 numpy array, the weights of linear regression model
        """
        N, D = xtrain.shape[0], xtrain.shape[1]
        weight = np.zeros((D, 1))
        for i in range(epochs):
            t1 = ytrain - np.dot(xtrain, weight)
            idx = i % N
            t2 = np.dot(xtrain[idx, :].T.reshape((D, 1)), t1[idx].reshape((1, 1)))
            temp = weight + learning_rate * t2
            weight = temp
        return weight
        
#         raise NotImplementedError
        
    # =================
    # RIDGE REGRESSION
        
    def ridge_fit_closed(self, xtrain, ytrain, c_lambda): # [5pts]
        """
        Args:
            xtrain: N x D numpy array, where N is number of instances and D is the dimensionality of each instance
            ytrain: N x 1 numpy array, the true labels
            c_lambda: floating number
        Return:
            weight: Dx1 numpy array, the weights of ridge regression model
        """
        N, D = xtrain.shape[0], xtrain.shape[1]
        ide = np.identity(D)
        ide[0][0] = 0.0           
        # term1
        t1 = np.linalg.pinv(np.dot(xtrain.T, xtrain) + c_lambda * ide)
        # term2
        t2 = np.dot(t1, xtrain.T)
        weight = np.dot(t2, ytrain)
        return weight
        
#         raise NotImplementedError

        
    def ridge_fit_GD(self, xtrain, ytrain, c_lambda, epochs=500, learning_rate=1e-7): # [5pts]
        """
        Args:
            xtrain: NxD numpy array, where N is number 
                    of instances and D is the dimensionality of each 
                    instance
            ytrain: Nx1 numpy array, the true labels
            c_lambda: floating number
        Return:
            weight: Dx1 numpy array, the weights of ridge regression model
        """
        N, D = xtrain.shape[0], xtrain.shape[1]
        weight = np.zeros((D, 1))
        for i in range(epochs):
            #term 1
            t1 = np.dot(xtrain.T, (ytrain - np.dot(xtrain, weight)))
            temp = weight + learning_rate * (t1 + c_lambda * weight) / N
            weight = temp
        return weight
        
#         raise NotImplementedError

    def ridge_fit_SGD(self, xtrain, ytrain, c_lambda, epochs=100, learning_rate=0.001): # [5pts]
        """
        Args:
            xtrain: NxD numpy array, where N is number 
                    of instances and D is the dimensionality of each 
                    instance
            ytrain: Nx1 numpy array, the true labels
        Return:
            weight: Dx1 numpy array, the weights of ridge regression model
        """
        N, D = xtrain.shape[0], xtrain.shape[1]
        weight = np.zeros((D, 1))
        for i in range(epochs):
            t1 = ytrain - np.dot(xtrain, weight)
            idx = i % N
            t2 = np.dot(xtrain[idx, :].T.reshape((D, 1)), t1[idx].reshape((1, 1)))
            temp = weight + learning_rate * (t2 + c_lambda * weight)
            weight = temp
        return weight
        
#         raise NotImplementedError

    def ridge_cross_validation(self, X, y, kfold=10, c_lambda=100): # [5 pts]
        """
        Args: 
            X : NxD numpy array, where N is the number of instances and D is the dimensionality of each instance
            y : Nx1 numpy array, true labels
            kfold: Number of folds you should take while implementing cross validation.
            c_lambda: Value of regularization constant
        Returns:
            meanErrors: Float average rmse error
        Hint: np.concatenate might be helpful.
        Look at 3.5 to see how this function is being used.
        # For cross validation, use 10-fold method and only use it for your training data (you already have the train_indices to get training data).
        # For the training data, split them in 10 folds which means that use 10 percent of training data for test and 90 percent for training.
        """
        N, D = X.shape[0], X.shape[1]
        size_of_fold = int(N / kfold)
        meanErrors = 0.0
        for i in range(kfold):
            # train model on every fold except ith
            xtrain1 = X[:i * size_of_fold, :]
            xtrain2 = X[(i + 1) * size_of_fold:, :]
            xtrain = np.concatenate((xtrain1, xtrain2))
            ytrain1 = y[:i * size_of_fold]
            ytrain2 = y[(i + 1) * size_of_fold:]
            ytrain = np.concatenate((ytrain1, ytrain2))
            weight = self.ridge_fit_closed(xtrain, ytrain, c_lambda)

            # compute error on ith fold
            xtest = X[i * size_of_fold:(i + 1) * size_of_fold, :]
            ytest = y[i * size_of_fold:(i + 1) * size_of_fold]
            pred = self.predict(xtest, weight)
            error = self.rmse(pred, ytest)

            # add to error
            meanErrors += error

        meanErrors /= kfold
        return meanErrors
            
        
#         raise NotImplementedError

 
3.2 About RMSE [3 pts] **[W]**
Do you know whether this RMSE is good or not? If you don't know, we could normalize our labels between 0 and 1. After normalization, what does it mean when RMSE = 1?

Hint: think of the way that you can enforce your RMSE = 1. Note that you can not change the actual labels to make RMSE = 1.
 
Although a small RMSE generally indicates a good regression model, the raw value by itself is hard to interpret because it depends on the scale of the labels. If we normalize the labels to lie between 0 and 1, the RMSE becomes easy to read relative to the label range. In that setting, RMSE = 1 means the average prediction error is as large as the entire label range (for example, always predicting 0 when the true label is 1), so the model is performing very poorly.
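A tiny numeric illustration of this point, with made-up normalized labels: a model that always predicts the opposite extreme has RMSE 1, while a perfect model has RMSE 0.

```python
import numpy as np

label = np.array([0., 1., 1., 0.])       # normalized ground-truth labels
worst = 1.0 - label                      # always predicts the opposite extreme
best = label.copy()                      # perfect predictions

rmse = lambda pred, y: np.sqrt(np.mean((pred - y) ** 2))
print(rmse(worst, label))  # 1.0 -> error spans the whole label range
print(rmse(best, label))   # 0.0
```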
 
3.3 Testing: general functions and linear regression [5 pts] **[W]**
Let's first construct a dataset for polynomial regression.

In this case, we construct the polynomial features up to degree 5. Each data sample consists of two features $[a, b]$. We compute the polynomial features of both $a$ and $b$ to obtain the vectors $[1, a, a^2, a^3, \ldots, a^{degree}]$ and $[1, b, b^2, b^3, \ldots, b^{degree}]$. We train our model with the Cartesian product of these polynomial features. The Cartesian product generates a new feature vector consisting of all polynomial combinations of the features with degree less than or equal to the specified degree.

For example, if degree = 2, we will have the polynomial features $[1, a, a^2]$ and $[1, b, b^2]$ for the datapoint $[a, b]$. The Cartesian product of these two vectors will be $[1, a, b, ab, a^2, b^2]$. We do not generate $a^3$ and $b^3$ since their degree is greater than 2 (the specified degree).
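To make the degree-2 example concrete, here is a small illustrative sketch (separate from the helper cell below) that builds the features for a single datapoint $[a, b]$ by keeping only products whose total degree is at most 2:

```python
import numpy as np

def degree2_features(a, b):
    # Polynomial features of each variable up to degree 2
    fa = np.array([1.0, a, a**2])
    fb = np.array([1.0, b, b**2])
    # All products fa[i] * fb[j], keeping only total degree i + j <= 2
    feats = [fa[i] * fb[j] for i in range(3) for j in range(3) if i + j <= 2]
    return np.array(feats)

print(degree2_features(2.0, 3.0))  # contains 1, a, b, ab, a^2, b^2 (in some order)
```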
In [20]:
#helper, do not need to change
POLY_DEGREE = 5
NUM_OBS = 1000

rng = np.random.RandomState(seed=4)

true_weight = -rng.rand((POLY_DEGREE)**2+2, 1)
true_weight[2:, :] = 0
x_all1 = np.linspace(-5, 5, NUM_OBS)
x_all2 = np.linspace(-3, 3, NUM_OBS)
x_all = np.stack((x_all1,x_all2), axis=1)

reg = Regression()
x_all_feat1 = reg.construct_polynomial_feats(x_all[:,0], POLY_DEGREE)
x_all_feat2 = reg.construct_polynomial_feats(x_all[:,1], POLY_DEGREE)

x_cart_flat = []
for i in range(x_all_feat1.shape[0]):
    x1 = x_all_feat1[i]
    x2 = x_all_feat2[i]
    x1_end = x1[-1]
    x2_end = x2[-1]
    x1 = x1[:-1]
    x2 = x2[:-1]
    x3 = np.asarray([[m*n for m in x1] for n in x2])

    x3_flat = np.reshape(x3, (x3.shape[0]**2))
    x3_flat = list(x3_flat)
    x3_flat.append(x1_end)
    x3_flat.append(x2_end)
    x3_flat = np.asarray(x3_flat)
    x_cart_flat.append(x3_flat)
    
x_cart_flat = np.asarray(x_cart_flat)
x_all_feat = np.copy(x_cart_flat)

y_all = np.dot(x_cart_flat, true_weight) + rng.randn(x_all_feat.shape[0], 1) # in the second term, we add noise to data
print(x_all_feat.shape, y_all.shape)

# Note that here we try to produce y_all as our training data
#plot_curve(x_all, y_all) # Data with noise that we are going to predict
#plot_curve(x_all, np.dot(x_cart_flat, true_weight), curve_type='-', color='r', lw=4) # the groundtruth information

indices = rng.permutation(NUM_OBS)

 
(1000, 27) (1000, 1)

In [21]:
# HELPER CELL, DO NOT MODIFY
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

p = np.reshape(np.dot(x_cart_flat, true_weight), (1000,))
print(x_all[:,0].shape, x_all[:,1].shape,p.shape)
ax.plot(x_all[:,0], x_all[:,1], p, c="red",linewidth=4)
ax.scatter(x_all[:,0], x_all[:,1], y_all,s=4)

 
(1000,) (1000,) (1000,)

Out[21]:
<mpl_toolkits.mplot3d.art3d.Path3DCollection at 0x7fd7d738a940>
 
 
 
In the figure above, the red curve is the true function we want to learn, while the blue dots are the noisy observations. The observations are generated by $Y = X\theta + \sigma$, where $\sigma \sim N(0, 1)$ is i.i.d. noise.

Now let's split the data into two parts, namely the training set and test set. The red dots are for training, while the blue dots are for testing.
In [22]:
# HELPER CELL, DO NOT MODIFY
train_indices = indices[:NUM_OBS//2]
test_indices = indices[NUM_OBS//2:]

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

xtrain = x_all[train_indices]
ytrain = y_all[train_indices]
xtest = x_all[test_indices]
ytest = y_all[test_indices]

print(xtrain.shape, xtest.shape, y_all.shape)
ax.scatter(xtrain[:,0], xtrain[:,1], ytrain, c='r',s=4)
ax.scatter(xtest[:,1], xtest[:,1], ytest, c='b',s=4)

 
(500, 2) (500, 2) (1000, 1)

Out[22]:
<mpl_toolkits.mplot3d.art3d.Path3DCollection at 0x7fd7d7228710>
 
 
 
Now let's first train using the entire training set, and see how we perform on the test set and what the learned function looks like.
In [23]:
# HELPER CELL, DO NOT MODIFY
weight = reg.linear_fit_closed(x_all_feat[train_indices], y_all[train_indices])
y_test_pred = reg.predict(x_all_feat[test_indices], weight)
test_rmse = reg.rmse(y_test_pred, y_all[test_indices])
print('test rmse: %.4f' % test_rmse)

 
test rmse: 0.9589

In [24]:
# HELPER CELL, DO NOT MODIFY
weight = reg.linear_fit_GD(x_all_feat[train_indices], y_all[train_indices], epochs=500000, learning_rate=1e-9)
y_test_pred = reg.predict(x_all_feat[test_indices], weight)
test_rmse = reg.rmse(y_test_pred, y_all[test_indices])
print('test rmse: %.4f' % test_rmse)

 
test rmse: 1.2971

 
And what if we just use the first 10 observations to train?
In [25]:
# HELPER CELL, DO NOT MODIFY
sub_train = train_indices[:10]
weight = reg.linear_fit_closed(x_all_feat[sub_train], y_all[sub_train])
y_test_pred = reg.predict(x_all_feat[test_indices], weight)
test_rmse = reg.rmse(y_test_pred, y_all[test_indices])
print('test rmse: %.4f' % test_rmse)

 
test rmse: 5.2039

 
Did you see a worse performance? Let's take a closer look at what we have learned.
In [26]:
# HELPER CELL, DO NOT MODIFY
y_pred = reg.predict(x_all_feat, weight)

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

x1 = x_all[:,0]
x2 = x_all[:,0]
y_pred = np.reshape(y_pred, (1000,))
ax.plot(x1, x2, y_pred, color='b', lw=4)

x3 = x_all[sub_train,0]
x4 = x_all[sub_train,1]
ax.scatter(x3, x4, y_all[sub_train], s=100, c='r', marker='x')

y_test_pred = reg.predict(x_all_feat[test_indices], weight)

 
 
 
3.4.1 Testing: ridge regression [5 pts] **[W]**
Now let's try ridge regression. Similarly, undergraduate students only need to implement the closed form, and graduate students need to implement all three methods. We will call the prediction function from the linear regression part.
 
Again, let's see what we have learned. You only need to run the cell corresponding to your specific implementation.
In [27]:
# HELPER CELL, DO NOT MODIFY
sub_train = train_indices[:10]
print(x_all_feat[sub_train].shape)
print(y_all[sub_train].shape)
weight = reg.ridge_fit_closed(x_all_feat[sub_train], y_all[sub_train], c_lambda=1000)

y_pred = reg.predict(x_all_feat, weight)

y_test_pred = reg.predict(x_all_feat[test_indices], weight)
test_rmse = reg.rmse(y_test_pred, y_all[test_indices])
print('test rmse: %.4f' % test_rmse)

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

x1 = x_all[:,0]
x2 = x_all[:,0]
y_pred = np.reshape(y_pred, (1000,))
ax.plot(x1, x2, y_pred, color='b', lw=4)

x3 = x_all[sub_train,0]
x4 = x_all[sub_train,1]
ax.scatter(x3, x4, y_all[sub_train], s=100, c='r', marker='x')

y_test_pred = reg.predict(x_all_feat[test_indices], weight)

 
(10, 27)
(10, 1)
test rmse: 1.4805

 
 
In [28]:
# HELPER CELL, DO NOT MODIFY
sub_train = train_indices[:10]
weight = reg.ridge_fit_GD(x_all_feat[sub_train], y_all[sub_train], c_lambda=1000, learning_rate=1e-9)

y_pred = reg.predict(x_all_feat, weight)

y_test_pred = reg.predict(x_all_feat[test_indices], weight)
test_rmse = reg.rmse(y_test_pred, y_all[test_indices])
print('test rmse: %.4f' % test_rmse)

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

x1 = x_all[:,0]
x2 = x_all[:,0]
y_pred = np.reshape(y_pred, (1000,))
ax.plot(x1, x2, y_pred, color='b', lw=4)

x3 = x_all[sub_train,0]
x4 = x_all[sub_train,1]
ax.scatter(x3, x4, y_all[sub_train], s=100, c='r', marker='x')

y_test_pred = reg.predict(x_all_feat[test_indices], weight)

 
test rmse: 1.6795

 
 
In [29]:
# HELPER CELL, DO NOT MODIFY
sub_train = train_indices[:10]
weight = reg.ridge_fit_SGD(x_all_feat[sub_train], y_all[sub_train], c_lambda=1000, learning_rate=1e-9)

y_pred = reg.predict(x_all_feat, weight)

y_test_pred = reg.predict(x_all_feat[test_indices], weight)
test_rmse = reg.rmse(y_test_pred, y_all[test_indices])
print('test rmse: %.4f' % test_rmse)

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

x1 = x_all[:,0]
x2 = x_all[:,0]
y_pred = np.reshape(y_pred, (1000,))
ax.plot(x1, x2, y_pred, color='b', lw=4)

x3 = x_all[sub_train,0]
x4 = x_all[sub_train,1]
ax.scatter(x3, x4, y_all[sub_train], s=100, c='r', marker='x')

y_test_pred = reg.predict(x_all_feat[test_indices], weight)

 
test rmse: 1.6893

 
 
 
3.4.2 Lasso and Ridge Regression [5 pts] **[W]**
We train two linear regression models with different regularizations: one with lasso regularization and the other with ridge regularization. Let $w_1$ be the final weight vector for the model with lasso regularization and let $w_2$ be the final weight vector for the model with ridge regularization. How do $w_1$ and $w_2$ differ in terms of sparsity? For ridge regression, how do the weights change as lambda changes?
 
If we train two linear regression models with lasso and ridge regularization, then $w_1$, the final weight vector obtained with lasso regularization, will be sparser than $w_2$, the final weight vector obtained with ridge regularization. This is because the lasso penalty drives many weights exactly to zero. Ridge regularization also penalizes the weights, but it only shrinks them toward zero without necessarily making them exactly zero.

For ridge regression, a higher lambda means a stronger penalty, which results in smaller weights.
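As an optional, hedged demonstration of this sparsity difference, the sketch below fits scikit-learn's Lasso and Ridge on synthetic data (the dataset and alpha value are arbitrary choices for illustration) and counts the zero weights:

```python
import numpy as np
from sklearn.linear_model import Lasso, Ridge

rng = np.random.RandomState(0)
X = rng.randn(200, 20)
true_w = np.zeros(20)
true_w[:3] = [2.0, -1.5, 3.0]                    # only 3 informative features
y = X @ true_w + 0.1 * rng.randn(200)

w1 = Lasso(alpha=0.1).fit(X, y).coef_            # lasso-regularized weights
w2 = Ridge(alpha=0.1).fit(X, y).coef_            # ridge-regularized weights

print("zero weights (lasso):", np.sum(np.isclose(w1, 0)))   # many exact zeros
print("zero weights (ridge):", np.sum(np.isclose(w2, 0)))   # typically none, just small values
```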
 
3.5 Cross validation [7 pts] **[W]**
Let's use Cross Validation to find the best value for c_lambda in ridge regression.
In [30]:
# We provided 6 possible values for lambda, and you will use them in cross validation.
# For cross validation, use 10-fold method and only use it for your training data (you already have the train_indices to get training data).
# For the training data, split them in 10 folds which means that use 10 percent of training data for test and 90 percent for training.
# At the end, for each lambda you will have calculated 10 RMSE values; take their mean.
# Then pick the lambda with the lowest mean RMSE.
# Hint: np.concatenate is your friend.
best_lambda = None
best_error = None
kfold = 10
lambda_list = [0.1, 1, 5, 10, 100, 1000]

for lm in lambda_list:
    err = reg.ridge_cross_validation(x_all_feat[train_indices], y_all[train_indices], kfold, lm)
    print('lambda: %.2f' % lm, 'error: %.6f'% err)
    if best_error is None or err < best_error:
        best_error = err
        best_lambda = lm

print('best_lambda: %.2f' % best_lambda)
weight = reg.ridge_fit_closed(x_all_feat[train_indices], y_all[train_indices], c_lambda=best_lambda)
y_test_pred = reg.predict(x_all_feat[test_indices], weight)
test_rmse = reg.rmse(y_test_pred, y_all[test_indices])
print('test rmse: %.4f' % test_rmse)  

 
lambda: 0.10 error: 1.015281
lambda: 1.00 error: 1.015235
lambda: 5.00 error: 1.015084
lambda: 10.00 error: 1.014995
lambda: 100.00 error: 1.019286
lambda: 1000.00 error: 1.035747
best_lambda: 10.00
test rmse: 0.9593

 
4 Naive Bayes Classification [20pts]
 
In Bayesian classification, we're interested in finding the probability of a label given some observed feature vector $x = [x_1, \ldots, x_d]$, which we can write as $P(y \mid x_1, \ldots, x_d)$. Bayes' theorem tells us how to express this in terms of quantities we can compute more directly:

$$P(y \mid x_1, \ldots, x_d) = \frac{P(x_1, \ldots, x_d \mid y)\, P(y)}{P(x_1, \ldots, x_d)}$$

The main assumption in Naive Bayes is that, given the label, the observed features are conditionally independent, i.e.

$$P(x_1, \ldots, x_d \mid y) = P(x_1 \mid y) \times \cdots \times P(x_d \mid y)$$

Therefore, we can rewrite Bayes' rule as

$$P(y \mid x_1, \ldots, x_d) = \frac{P(x_1 \mid y) \times \cdots \times P(x_d \mid y)\, P(y)}{P(x_1, \ldots, x_d)}$$

Training Naive Bayes
One way to train a Naive Bayes classifier is the frequentist approach to estimating probabilities: go over the training data and calculate the frequency of different observations given different labels. For example,

$$P(x_1 = i \mid y = j) = \frac{P(x_1 = i, y = j)}{P(y = j)} = \frac{\text{number of times in the training data that } x_1 = i \text{ and } y = j}{\text{total number of times in the training data that } y = j}$$

Testing Naive Bayes
During the testing phase, we estimate the probability of each label given an observed feature vector by combining the probabilities computed from the training data. For example, if we are trying to decide between two labels $y_1$ and $y_2$, we compute the ratio of the posterior probabilities of the two labels:

$$\frac{P(y_1 \mid x_1, \ldots, x_d)}{P(y_2 \mid x_1, \ldots, x_d)} = \frac{P(x_1, \ldots, x_d \mid y_1)\, P(y_1)}{P(x_1, \ldots, x_d \mid y_2)\, P(y_2)} = \frac{P(x_1 \mid y_1) \times \cdots \times P(x_d \mid y_1)\, P(y_1)}{P(x_1 \mid y_2) \times \cdots \times P(x_d \mid y_2)\, P(y_2)}$$

All we need now is to compute $P(x_1 \mid y_i), \ldots, P(x_d \mid y_i)$ and $P(y_i)$ for each label by plugging in the numbers obtained during training. The label with the higher posterior probability is the one that is selected.
 
4.1 Bayes in Advertisements [5pts] **[W]**
 
An advertising agency wants to analyze the advertisements for a product. They want to target people from all age groups, so they show 5 advertisement videos to 200 people. The following table shows how many people from each age group liked each video.

| Age Group | Total | Video 1 | Video 2 | Video 3 | Video 4 | Video 5 |
|-----------|-------|---------|---------|---------|---------|---------|
| 16 - 35   | 100   | 20      | 30      | 60      | 15      | 90      |
| 36 - 55   | 60    | 15      | 50      | 30      | 20      | 40      |
| Above 55  | 40    | 35      | 30      | 10      | 10      | 5       |
| Total     | 200   | 70      | 110     | 100     | 45      | 135     |

A new consumer is shown the videos and he likes videos 2, 3 and 5. Which age group is he most likely to belong to?

Note: You can assume that each person provides an opinion about each video independently, i.e. Person 1 liking Video 1 has no effect on their assessment of Video 2.
 
We are given that the person likes videos 2, 3, and 5. For calculation purposes, we assume the person does not like videos 1 and 4. For each age group we compute the probability of liking videos 2, 3 and 5 while disliking videos 1 and 4; the age group with the highest posterior probability is the one this new person most likely belongs to.

First, the prior probabilities of each age group are:

$$P(\text{age}=16\text{-}35) = \frac{100}{200} = 0.5, \qquad P(\text{age}=36\text{-}55) = \frac{60}{200} = 0.3, \qquad P(\text{age}=55+) = \frac{40}{200} = 0.2$$

Let the new person be represented by

$$x = \{x_1 = 0,\; x_2 = 1,\; x_3 = 1,\; x_4 = 0,\; x_5 = 1\}$$

The posterior probabilities follow from Bayes' rule, e.g.

$$P(\text{age}=16\text{-}35 \mid x) = \frac{P(x \mid \text{age}=16\text{-}35)\, P(\text{age}=16\text{-}35)}{P(x)}$$

and similarly for the other two age groups. For comparison purposes we can drop the denominator $P(x)$, since it is the same in all three expressions. Applying the Naive Bayes factorization $P(x \mid \text{age}) = P(x_1 \mid \text{age}) \cdots P(x_5 \mid \text{age})$ and reading the conditional probabilities off the table (e.g. $P(x_1 = 0 \mid 16\text{-}35) = 80/100$, $P(x_2 = 1 \mid 16\text{-}35) = 30/100$), the numerators are:

$$P(x \mid 16\text{-}35)\, P(16\text{-}35) = \frac{80 \cdot 30 \cdot 60 \cdot 85 \cdot 90}{100^5} \cdot 0.5 \approx 0.11 \cdot 0.5 = 0.055$$

$$P(x \mid 36\text{-}55)\, P(36\text{-}55) = \frac{45 \cdot 50 \cdot 30 \cdot 40 \cdot 40}{60^5} \cdot 0.3 \approx 0.1389 \cdot 0.3 \approx 0.0417$$

$$P(x \mid 55+)\, P(55+) = \frac{5 \cdot 30 \cdot 10 \cdot 30 \cdot 5}{40^5} \cdot 0.2 \approx 0.002 \cdot 0.2 = 0.0004$$

Since the value for age group 16-35 is the highest, the new person most likely belongs to the 16-35 age group.
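A quick numeric check of the arithmetic above, with the conditional probabilities read directly from the table:

```python
# Naive Bayes numerators for the new person (dislikes videos 1 and 4, likes 2, 3, 5)
score_16_35 = (80/100) * (30/100) * (60/100) * (85/100) * (90/100) * 0.5
score_36_55 = (45/60)  * (50/60)  * (30/60)  * (40/60)  * (40/60)  * 0.3
score_55_up = (5/40)   * (30/40)  * (10/40)  * (30/40)  * (5/40)   * 0.2

print(round(score_16_35, 4), round(score_36_55, 4), round(score_55_up, 5))
# -> approximately 0.0551, 0.0417, 0.00044: age group 16-35 wins
```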
 
4.2 The Federalist Papers **[P]**
 
The Federalist Papers were a series of essays written in 1787–1788 meant to persuade the citizens of the State of New York to ratify the Constitution and which were published anonymously under the pseudonym “Publius”. In later years the authors were revealed as Alexander Hamilton, John Jay, and James Madison. However, there is some disagreement as to who wrote which essays. Hamilton wrote a list of which essays he had authored only days before being killed in a duel with then Vice President Aaron Burr. Madison wrote his own list many years later, which is in conflict with Hamilton’s list on 12 of the essays. Since by this point the two (who were once close friends) had become bitter rivals, historians have long been unsure as to the reliability of both lists. We will try to settle this dispute using a simple Naive Bayes classifier.

The code which is provided loads the documents and builds a "bag of words" representation of each document. Your task is to complete the missing portions of the code and to determine your best guess as to who wrote each of the 12 disputed essays. (Hint: H and M are the labels that stand for Hamilton and Madison, while the label D stands for disputed, for the papers we are trying to label. Our job here is to determine whether the D essays belong to H or M using Naive Bayes. Note that the label D for disputed is completely unrelated to the feature dimension D, which is an integer.)

The _priors_ratio function calculates the ratio of the class probabilities of a document belonging to Hamilton versus Madison. We do this based on word counts rather than document counts.

The _likelihood_ratio function calculates the ratio of word probabilities given the author (Hamilton versus Madison).

Note 1: In _likelihood_ratio() add one to each word count so as to avoid issues with zero word count. This is known as Add-1 smoothing. It is a type of additive smoothing.
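A minimal sketch of the Add-1 smoothing mentioned in Note 1, on made-up word counts: adding 1 to every count keeps the per-word ratio finite even when one author never uses a word.

```python
import numpy as np

# Made-up total word counts per author for a 4-word vocabulary
counts_H = np.array([10, 0, 3, 7])
counts_M = np.array([4, 2, 0, 9])

smoothed_H = counts_H + 1          # Add-1 (Laplace) smoothing
smoothed_M = counts_M + 1

# Per-word likelihood ratio Hamilton / Madison; no division by zero thanks to smoothing
ratio = (smoothed_H / smoothed_H.sum()) / (smoothed_M / smoothed_M.sum())
print(ratio)
```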
In [31]:
class NaiveBayes(object):

    def __init__(self):
        # load Documents
        x = open('fedpapers_split.txt').read()
        papers = json.loads(x)

        # split Documents
        papersH = papers[0] # papers by Hamilton 
        papersM = papers[1] # papers by Madison
        papersD = papers[2] # disputed papers

        # Number of Documents for H, M and D
        nH = len(papersH)
        nM = len(papersM)
        nD = len(papersD)
        
        '''To ignore certain common words in English that might skew your model, we add them to the stop words 
        list below. You may want to experiment by choosing your own list of stop words, but be sure to keep 
        'HAMILTON' and 'MADISON' in this list at a minimum, as their names appear in the text of the papers 
        and leaving them in could lead to unpredictable results '''

        stop_words = text.ENGLISH_STOP_WORDS.union({'HAMILTON','MADISON'})
        #stop_words = {'HAMILTON','MADISON'}
        # Form bag of words model using words used at least 10 times
        vectorizer = text.CountVectorizer(stop_words=stop_words,min_df=10)
        X = vectorizer.fit_transform(papersH+papersM+papersD).toarray()

        '''To visualize the full list of words remaining after filtering out stop words and words used less 
        than min_df times uncomment the following line'''
        #print(vectorizer.vocabulary_)

        # split word counts into separate matrices
        self.XH, self.XM, self.XD = X[:nH,:], X[nH:nH+nM,:], X[nH+nM:,:]
        

    def _likelihood_ratio(self, XH, XM): # [5pts]
        '''
        Args:
            XH: nH x D where nH is the number of documents that we have for Hamilton,
                while D is the number of features (we use the word count as the feature)
            XM: nM x D where nM is the number of documents that we have for Madison,
                while D is the number of features (we use the word count as the feature)
        Return:
            fratio: 1 x D vector of the likelihood ratio of different words (Hamilton/Madison)
        '''
        # Estimate probability of each word in vocabulary being used by Hamilton 
        
        # Estimate probability of each word in vocabulary being used by Madison 
        
        # Compute ratio of these probabilities
        # Hamilton
        total_freq_H = np.sum(XH, axis=0, keepdims=True) + 1

        # Madison
        total_freq_M = np.sum(XM, axis=0, keepdims=True) + 1

        fratio = (total_freq_H * np.sum(total_freq_M)) / (total_freq_M * np.sum(total_freq_H))
        return fratio
        
#         raise NotImplementedError
    
    def _priors_ratio(self, XH, XM): # [5pts]
        '''
        Args:
            XH: nH x D where nH is the number of documents that we have for Hamilton,
                while D is the number of features (we use the word count as the feature)
            XM: nM x D where nM is the number of documents that we have for Madison,
                while D is the number of features (we use the word count as the feature)
        Return:
            pr: prior ratio of (Hamilton/Madison)
        '''
        words_in_H = np.sum(XH)
        words_in_M = np.sum(XM)
        prior_ratio = float(words_in_H / words_in_M)
        return prior_ratio
        
        
#         raise NotImplementedError

    def classify_disputed(self, fratio, pratio, XD): # [5pts]
        '''
        Args:
            fratio: 1 x D vector of ratio of likelihoods of different words
            pratio: 1 x 1 number
            XD: 12 x D bag of words representation of the 12 disputed documents (D = 1307 which are the number of features for each document)
        Return:
             1 x 12 list, each entry is H to indicate Hamilton or M to indicate Madison for the corresponding document
        '''   
        res = []
        for i in range(12):
            doc = XD[i]
            t1 = np.power(fratio, doc)
            prob_of_H = np.prod(t1) * pratio
            if prob_of_H > 0.5:
                res.append('H')
            else:
                res.append('M')
        return res
#         raise NotImplementedError

In [32]:
# HELPER CELL, DO NOT MODIFY
NB = NaiveBayes()
fratio = NB._likelihood_ratio(NB.XH, NB.XM)
pratio = NB._priors_ratio(NB.XH, NB.XM)
resolved = NB.classify_disputed(fratio, pratio, NB.XD)
    
print(resolved)

 
['M', 'M', 'M', 'M', 'H', 'H', 'M', 'H', 'H', 'M', 'M', 'M']

 
5 Noise in PCA and Linear Regression (15 Pts) **[W]**
 
Both PCA and least squares regression can be viewed as algorithms for inferring (linear) relationships among data variables. In this part of the assignment, you will develop some intuition for the differences between these two approaches, and an understanding of the settings that are better suited to using PCA or better suited to using the least squares fit.

The high-level intuition is that PCA is useful when there is a set of latent (hidden/underlying) variables, and all the coordinates of your data are linear combinations (plus noise) of those variables. The least squares fit is useful when you have direct access to the independent variables, so any noisy coordinates are linear combinations (plus noise) of known variables.

5.1 Slope Functions (5 Pts) **[W]**
In the following cell:

- For this function, assume that X is the first feature and Y is the second feature of the data. Write a function pca_slope that takes a vector X of $x_i$'s and a vector Y of $y_i$'s and returns the slope of the first principal component found by PCA.
- Write a function lr_slope that takes X and y and returns the slope of the least squares fit. (Hint: since X is one-dimensional, this takes the particularly simple form $\big((X - \bar{X}) \cdot (Y - \bar{Y})\big) / \lVert X - \bar{X} \rVert_2^2$, where $\bar{X}$ is the mean value of X.)
In later subparts, we consider the case where our data consists of noisy measurements of x and y. For each part, we will evaluate the quality of the relationship recovered by PCA, and that recovered by standard least squares regression.

As a reminder, least squares regression minimizes the squared error of the dependent variable from its prediction. Namely, given $(x_i, y_i)$ pairs, least squares returns the line $l(x)$ that minimizes $\sum_i (y_i - l(x_i))^2$.

Note 1: You should use the PCA and Linear Regression implementations from Q2 and Q3 in this question. Do not use any kind of regularization for Linear Regression.
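As an optional sanity check of the hint's formula (toy numbers, independent of the required implementations), the mean-centered expression matches the slope of an ordinary least-squares line fit with an intercept:

```python
import numpy as np

x = np.array([0.0, 1.0, 2.0, 3.0])
y = np.array([1.0, 3.2, 4.9, 7.1])           # roughly y = 2x + 1

xc, yc = x - x.mean(), y - y.mean()
slope_formula = (xc @ yc) / (xc @ xc)        # ((X - Xbar) . (Y - Ybar)) / ||X - Xbar||_2^2
slope_polyfit = np.polyfit(x, y, 1)[0]       # slope of the degree-1 least squares fit

print(slope_formula, slope_polyfit)          # both = 2.0 for these toy values
```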
In [33]:
def pca_slope(x, y):
    """
    Calculates the slope of the first principal component given by PCA

    Args: 
        x: (N,) vector of feature x
        y: (N,) vector of feature y
    Return:
        slope: slope of the first principal component
    """
    N = x.shape[0]
    
    #make dataset
    x = x.reshape((N, 1))
    y = y.reshape((N, 1))
    data = np.hstack((x, y))
    
    #apply PCA
    obj = PCA()
    obj.fit(data)
    
    #slope of first principal component
    slope = obj.V[0, 1] / obj.V[0, 0]
    
    return slope
    
#     raise NotImplementedError
    

def lr_slope(X, y):
    """
    Calculates the slope of the best fit as given by Linear Regression
    
    For this function don't use any regularization

    Args: 
        X: N*1 array corresponding to a dataset
        y: N*1 array of labels y
    Return:
        slope: slope of the best fit
    """
    obj = Regression()
    slope = obj.linear_fit_closed(X, y)[0]
    
    return slope
    
#     raise NotImplementedError

 
We will consider a simple example with two variables, x and y, where the true relationship between the variables is y = 2x. Our goal is to recover this relationship—namely, recover the coefficient “2”. We set X = [0, .01, .02, .03, . . . , 1] and y = 2x. Make sure both functions return 2.
In [34]:
# HELPER CELL, DO NOT MODIFY
x = np.arange(0, 1, 0.01)
y = 2 * np.arange(0, 1, 0.01)

print("Slope of first principal component", pca_slope(x, y))

print("Slope of best linear fit", lr_slope(x[:, None], y))

plt.scatter(x, y)
plt.xlabel("x")
plt.ylabel("y")
plt.show()

 
Slope of first principal component 2.0
Slope of best linear fit 2.0

 
 
 
5.2 Analysis Setup (5 Pts) **[W]**
Error in y
 
In this subpart, we consider the setting where our data consists of the actual values of $x$ and noisy estimates of $y$. Run the following cell to see how the data looks when there is error in $y$.
In [35]:
# HELPER CELL, DO NOT MODIFY
base = np.arange(0.001, 1, 0.001)
c = 0.5
X = base
y = 2 * base + np.random.normal(loc=[0], scale=c, size=base.shape)

plt.scatter(X, y)
plt.xlabel("x")
plt.ylabel("y")
plt.show()

 
 
 
In the subsequent cell, implement the following:

- Fix $X = [x_1, x_2, \ldots, x_{1000}] = [.001, .002, .003, \ldots, 1]$.
- For a given noise level $c$, set $\hat{y}_i \sim 2x_i + N(0, c) = 2i/1000 + N(0, c)$, and $\hat{Y} = [\hat{y}_1, \hat{y}_2, \ldots, \hat{y}_{1000}]$.
- Make a scatter plot with $c$ on the horizontal axis and the outputs of pca_slope and lr_slope on the vertical axis. For each $c$ in $[0, 0.05, 0.1, \ldots, 0.95, 1.0]$, take a sample $\hat{Y}$, plot the output of pca_slope as a red dot and the output of lr_slope as a blue dot. Repeat 30 times. You should end up with a plot of 1260 dots, in 21 columns of 60, half red and half blue.
In [36]:
pca_slope_values = []
linreg_slope_values = []
c_values = []

for i in range(30):
    for c in np.arange(0, 1, 0.05):
        ####### YOUR CODE BELOW #######
        #step1
        X = base
        
        #step2
        y = 2 * base + np.random.normal(scale=c, size=base.shape)
        
        #step3
        slope1 = pca_slope(X, y)
        slope2 = lr_slope(X.reshape(-1, 1), y)
        pca_slope_values.append(slope1)
        linreg_slope_values.append(slope2) 
        ###############################

        c_values.append(c)

plt.scatter(c_values, pca_slope_values, c='r')
plt.scatter(c_values, linreg_slope_values, c='b')
plt.xlabel("c")
plt.ylabel("slope")
plt.show()

 
 
 
Error in x and y
 
We now examine the case where our data consists of noisy estimates of both $x$ and $y$. Run the following cell to see how the data looks when there is error in both.
In [37]:
# HELPER CELL, DO NOT MODIFY
base = np.arange(0.001, 1, 0.001)
c = 0.5
X = base + np.random.normal(loc=[0], scale=c, size=base.shape) * 0.5
y = 2 * base + np.random.normal(loc=[0], scale=c, size=base.shape) * 0.5

plt.scatter(X, y)
plt.xlabel("x")
plt.ylabel("y")
plt.show()

 
 
 
In the subsequent cell, implement the following:

- For a given noise level $c$, let $\hat{x}_i \sim x_i + N(0, c) = i/1000 + N(0, c)$, and $\hat{X} = [\hat{x}_1, \hat{x}_2, \ldots, \hat{x}_{1000}]$.
- For the same noise level $c$, set $\hat{y}_i \sim 2x_i + N(0, c) = 2i/1000 + N(0, c)$, and $\hat{Y} = [\hat{y}_1, \hat{y}_2, \ldots, \hat{y}_{1000}]$.
- Make a scatter plot with $c$ on the horizontal axis and the outputs of pca_slope and lr_slope on the vertical axis. For each $c$ in $[0, 0.05, 0.1, \ldots, 0.95, 1.0]$, take a sample of both $\hat{X}$ and $\hat{Y}$, plot the output of pca_slope as a red dot and the output of lr_slope as a blue dot. Repeat 30 times. You should end up with a plot of 1260 dots, in 21 columns of 60, half red and half blue.
In [38]:
pca_slope_values = []
linreg_slope_values = []
c_values = []

for i in range(30):
    for c in np.linspace(0, 1, 21):  # 21 noise levels 0, 0.05, ..., 1.0 (matches the 21 columns described above)
        ####### YOUR CODE BELOW #######       
        #step2
        X = base + np.random.normal(scale=c, size=base.shape)
        y = 2 * base + np.random.normal(scale=c, size=base.shape)
        
        #step3
        slope1 = pca_slope(X, y)
        slope2 = lr_slope(X.reshape(-1, 1), y)
        pca_slope_values.append(slope1)
        linreg_slope_values.append(slope2)         
        ###############################

        c_values.append(c)

plt.scatter(c_values, pca_slope_values, c='r')
plt.scatter(c_values, linreg_slope_values, c='b')
plt.xlabel("c")
plt.ylabel("slope")
plt.show()

 
 
 
5.3. Analysis (5 Pts) **[W]**
Based on your observations from the previous subsections, answer the following questions about the two cases (error in y only, and error in both x and y) in 2-3 lines.

Note:

The closer the recovered slope is to the actual slope ("2" here), the better the algorithm is performing.
You don't need to provide a proof for this question.
Questions:

Which case does PCA perform worse in? Why does PCA perform worse in this case?
Why does PCA perform better in the other case?
In which case does Linear Regression perform well? Why does it perform well in that case?
 
PCA performs worse in the case where there is error only in Y. PCA fits the first principal component by minimizing the orthogonal (perpendicular) distance from the points to the component, which implicitly assumes noise in both coordinates; when the noise is only in y, that assumption is violated and the recovered slope is biased away from 2.
PCA performs better in the other case, when there is error in both x and y, because noise of the same scale is present along both axes, which matches the assumption behind minimizing orthogonal distance, so the first principal component stays aligned with the true direction.
Linear regression, on the other hand, performs better when there is error only in Y, because its least-squares loss models Gaussian noise in the predictions along the y-axis only, which is exactly this setting; with noise in x as well, its slope estimate is biased toward zero.
 
6 Manifold learning [Bonus for everyone][30 pts] **[W]**|**[P]**
While PCA is a wonderful tool for dimensionality reduction, it does not work very well when dealing with non-linear relationships between features. Manifold learning is a class of algorithms that can be used to reduce dimensions in complex high-dimensional datasets. While a number of methods have been proposed to perform this type of operation, we will focus on Isomap. Isomap is known to be sensitive to noise in the data, among other issues, but it has been shown to perform reasonably well on real-world data. The algorithm consists of two main steps: first computing a manifold distance matrix, followed by classical multidimensional scaling. You will be creating your own implementation of Isomap. In order to do so, you must read the original paper "A Global Geometric Framework for Nonlinear Dimensionality Reduction" by Tenenbaum et al. (2000), which outlines the method. You are also encouraged to read this general survey of manifold learning by Cayton (2005), where the original algorithm is further explained in a more detailed yet simplified fashion.
 
6.1 Implementation [23 pts] **[P]**
 
6.1.1 pairwise distance [3pts] **[P]**
In this section, you are asked to implement pairwise_dist function.

Given X ∈ R^(N x D) and Y ∈ R^(M x D), obtain the pairwise distance matrix dist ∈ R^(N x M) using the Euclidean distance metric, where dist[i, j] = ||X_i − Y_j||_2.

DO NOT USE FOR LOOP in your implementation -- they are slow and will make your code too slow to pass our grader. Use array broadcasting instead.
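As a rough illustration (not the graded implementation), the pairwise distances can be computed entirely with broadcasting by expanding ||x_i − y_j||^2 = ||x_i||^2 + ||y_j||^2 − 2 x_i·y_j; the function name below is just for this sketch:

import numpy as np

def pairwise_dist_sketch(x, y):
    # x: N x D, y: M x D  ->  dist: N x M with dist[i, j] = ||x[i] - y[j]||_2
    sq = np.sum(x ** 2, axis=1)[:, None] + np.sum(y ** 2, axis=1)[None, :] - 2.0 * (x @ y.T)
    return np.sqrt(np.maximum(sq, 0.0))  # clip tiny negatives caused by round-off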
 
6.1.2 manifold distance matrix [10pts] **[P]**
In this section, you need to implement manifold_distance_matrix function.

Given X ∈ R^(N x D) and the number of nearest neighbors K, compute the N x N distance matrix dist, where the entries obey the following:

dist[i, j] = ||X_i − X_j||_2 if j ∈ Neighbour(i), and dist[i, j] = the shortest path distance through the neighborhood graph if j ∉ Neighbour(i).
Hint: For doing this part, you can partly utilize the scipy toolbox. After creating your k-nearest weighted neighbors adjacency matrix, you can convert it to a sparse graph object csr_matrix and utilize the pre-built Floyd-Warshall algorithm to compute the manifold distance matrix.
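Following that hint, a minimal sketch of the idea (assuming a K-nearest-neighbor graph built by hand with argsort; the function name is illustrative only) could look like this:

import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import floyd_warshall

def manifold_distance_matrix_sketch(x, K):
    # all pairwise euclidean distances (N x N)
    d = np.sqrt(((x[:, None, :] - x[None, :, :]) ** 2).sum(axis=-1))
    # keep only the K nearest neighbors of each point as weighted edges
    adj = np.zeros_like(d)
    nn = np.argsort(d, axis=1)[:, 1:K + 1]          # column 0 is the point itself
    rows = np.repeat(np.arange(d.shape[0]), K)
    adj[rows, nn.ravel()] = d[rows, nn.ravel()]
    # shortest paths through the neighborhood graph give the manifold distances
    # (if K is too small the graph can be disconnected and some entries become inf)
    return floyd_warshall(csr_matrix(adj), directed=False)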
 
6.1.3 multidimensional scaling [10pts] **[P]**
In this section, you need to implement the multidimensional_scaling function.

Given the computed distance matrix dist and the size d of the new reduced feature space, you need to return the embedding X in the new feature space.

Closed Form of the Gram Matrix with Centering:

We can now succinctly state the closed matrix form of B by making use of the centering matrix:

B = -(1/2) C_n D^(2) C_n

Note: C_n is the centering matrix, with C_n = I_n − (1/n) 1 1^T, and D^(2) is the matrix with entries d_ij^2 taken from the original distance matrix.

Find eigenvalues and eigenvectors of matrix B. Since the Gram matrix B is a real, symmetric, positive semi-definite matrix, we know that it will have real eigenvalues, and we can use the following eigendecomposition of B in order to find an expression for our output configuration:

B = V Λ V^T = (Λ^(1/2) V^T)^T (Λ^(1/2) V^T) = X X^T   (from the original definition of B)

and therefore we have

X = V Λ^(1/2)

where the eigenvalues are given by the diagonal matrix Λ = diag(λ_1, ..., λ_n) and the eigenvectors are the columns of V = (v_1, ..., v_n).

Find coordinates of the output configuration. We can now define a k-dimensional configuration by choosing the k largest eigenvalues and the corresponding k columns of V:

X_k = V_k Λ_k^(1/2)

where Λ_k is the k x k diagonal submatrix of Λ and V_k is the n x k submatrix of V.
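Putting the equations above together, a minimal classical-MDS sketch (using numpy's eigh and keeping the d largest eigenvalues; names are illustrative only) might be:

import numpy as np

def multidimensional_scaling_sketch(dist_matrix, d):
    n = dist_matrix.shape[0]
    C = np.eye(n) - np.ones((n, n)) / n        # centering matrix C_n
    B = -0.5 * C @ (dist_matrix ** 2) @ C      # B = -1/2 C_n D^(2) C_n
    evals, evecs = np.linalg.eigh(B)           # real eigenvalues since B is symmetric
    top = np.argsort(evals)[::-1][:d]          # indices of the d largest eigenvalues
    lam = np.maximum(evals[top], 0.0)          # guard against small negative eigenvalues
    return evecs[:, top] * np.sqrt(lam)        # X_d = V_d Lambda_d^(1/2)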
 
In the cell below, implement the code for section 6.1.
In [39]:
class Isomap(object):
    def __init__(self): # No need to implement
        pass
    
    def pairwise_dist(self, x, y): # [3 pts]
        """
        Args:
            x: N x D numpy array
            y: M x D numpy array
        Return:
                dist: N x M array, where dist2[i, j] is the euclidean distance between 
                x[i, :] and y[j, :]
                """
    
        raise NotImplementedError
    
    def manifold_distance_matrix(self, x, K): # [10 pts]
        """
        Args:
            x: N x D numpy array
            K: int, number of nearest neighbors used to build the neighborhood graph
        Return:
            dist_matrix: N x N numpy array, where dist_matrix[i, j] is the euclidean distance between points i and j
            if j is in the neighborhood N(i), or the shortest path distance through the neighborhood graph otherwise.
        Hint: After creating your k-nearest weighted neighbors adjacency matrix, you can convert it to a sparse graph
        object csr_matrix (https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html) and utilize
        the pre-built Floyd-Warshall algorithm (https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csgraph.floyd_warshall.html)
        to compute the manifold distance matrix.
        """

        raise NotImplementedError

    
    def multidimensional_scaling(self, dist_matrix, d): # [10 pts]
        """
        Args:
            dist_matrix: N x N numpy array, the manifold distance matrix
            d: integer, size of the new reduced feature space 
        Return:
            S: N x d numpy array, X embedding into new feature space.
        """
        
        raise NotImplementedError

    # you do not need to change this
    def __call__(self, data, K, d):
        # get the manifold distance matrix
        W = self.manifold_distance_matrix(data, K)
        # compute the multidimensional scaling embedding
        emb_X = self.multidimensional_scaling(W, d)
        return emb_X

 
6.2 Examples for different datasets [7pts] **[W]**
Apply your implementation of Isomap for some of the datasets (e.g. MNIST and Iris). Discuss how the results compare to PCA.
In [40]:
# HELPER CELL, DO NOT MODIFY
# example MNIST data
mnist = load_digits()
proj = Isomap()(mnist.data, 10, 2)
plt.scatter(proj[:, 0], proj[:, 1], c=mnist.target, cmap=plt.cm.get_cmap('jet', 10))
plt.colorbar(ticks=range(10))

 
---------------------------------------------------------------------------
NotImplementedError                       Traceback (most recent call last)
<ipython-input-40-a9ee22e4c0ae> in <module>
      2 # example MNIST data
      3 mnist = load_digits()
----> 4 proj = Isomap()(mnist.data, 10, 2)
      5 plt.scatter(proj[:, 0], proj[:, 1], c=mnist.target, cmap=plt.cm.get_cmap('jet', 10))
      6 plt.colorbar(ticks=range(10))

<ipython-input-39-352e881a55a4> in __call__(self, data, K, d)
     45     def __call__(self, data, K, d):
     46         # get the manifold distance matrix
---> 47         W = self.manifold_distance_matrix(data, K)
     48         # compute the multidimensional scaling embedding
     49         emb_X = self.multidimensional_scaling(W, d)

<ipython-input-39-352e881a55a4> in manifold_distance_matrix(self, x, K)
     28         """
     29 
---> 30         raise NotImplementedError
     31 
     32 

NotImplementedError: 
 
7 Feature Selection Implementation [No Points] **[P]**
Note: This is a fun question for you to learn about Feature Reduction. No points will be awarded for it. If you have time please go over it. It would be beneficial for your project.

Implement a method to find the final list of significant features due to forward selection and backward elimination.

Forward Selection:
In forward selection, we start with a null model, fit the model with one individual feature at a time, and select the feature with the minimum p-value. We keep adding features in this way as long as the best new feature's p-value is below the significance level, and stop once no remaining feature is significant.

Steps to implement it:

1: Choose a significance level (given to you).
2: Fit all possible simple regression models by considering one feature at a time.
3: Select the feature with the lowest p-value.
4: Fit all possible models with one extra feature added to the previously selected feature(s).
5: Select the feature with the minimum p-value again. If p_value < significance level, go to Step 4. Otherwise, terminate.
Backward Elimination:
In backward elimination, we start with a full model, and then remove the insignificant feature with the highest p-value (that is greater than the significance level). We continue to do this until we have a final set of significant features.

Steps to implement it:

1: Choose a significance level (given to you).
2: Fit a full model including all the features.
3: Select the feature with the highest p-value. If (p_value > significance level), go to Step 4, otherwise terminate.
4: Remove the feature under consideration.
5: Fit a model without this feature. Repeat entire process from Step 3 onwards.
TIP 1: The p-value is known as the observed significance value for a test hypothesis. It tests all the assumptions about how the data was generated in the model, not just the target hypothesis it was supposed to test. Some more information about p-values can be found here: https://towardsdatascience.com/what-is-a-p-value-b9e6c207247f

TIP 2: For this function, you will have to install statsmodels if not installed already. Run 'pip install statsmodels' in command line/terminal. statsmodels is a Python module that provides classes and functions for the estimation of many different statistical models, as well as for conducting statistical tests, and statistical data exploration. You will have to use this library to choose a regression model to fit your data against. Some more information about this module can be found here: https://www.statsmodels.org/stable/index.html
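As a rough sketch of both procedures described above (assuming, as in the helper cell below, that data is a pandas DataFrame of features and target is the label Series; the OLS fit and p-values come from statsmodels), one possible shape of the code is:

import pandas as pd
import statsmodels.api as sm

def forward_selection_sketch(data, target, significance_level=0.05):
    selected, remaining = [], list(data.columns)
    while remaining:
        # p-value of each candidate feature when added to the current model
        pvals = {}
        for feat in remaining:
            model = sm.OLS(target, sm.add_constant(data[selected + [feat]])).fit()
            pvals[feat] = model.pvalues[feat]
        best = min(pvals, key=pvals.get)
        if pvals[best] >= significance_level:
            break                      # no remaining feature is significant
        selected.append(best)
        remaining.remove(best)
    return selected

def backward_elimination_sketch(data, target, significance_level=0.05):
    selected = list(data.columns)
    while selected:
        model = sm.OLS(target, sm.add_constant(data[selected])).fit()
        pvals = model.pvalues.drop('const', errors='ignore')
        worst = pvals.idxmax()
        if pvals[worst] <= significance_level:
            break                      # every remaining feature is significant
        selected.remove(worst)
    return selected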
In [ ]:
import pandas as pd
import statsmodels.api as sm

class FeatureReduction(object):
    def __init__(self):
        pass

    @staticmethod
    def forward_selection(data, target, significance_level=0.05):
        '''
        Args:
            data: data frame that contains the feature matrix
            target: target feature to search to generate significant features
            significance_level: the probability of the event occurring by chance
        Return:
            forward_list: list containing significant features
        '''
        
        raise NotImplementedError

    @staticmethod
    def backward_elimination(data, target, significance_level = 0.05):
        '''
        Args:
            data: data frame that contains the feature matrix
            target: target feature to search to generate significant features
            significance_level: the probability of the event occurring by chance
        Return:
            backward_list: list containing significant features
        '''
        
        raise NotImplementedError

In [ ]:
# HELPER CELL, DO NOT MODIFY
boston = load_boston()
bos = pd.DataFrame(boston.data, columns = boston.feature_names)
bos['Price'] = boston.target
X = bos.drop("Price", 1)       # feature matrix 
y = bos['Price']               # target feature
featurereduction = FeatureReduction()
#Run the functions to make sure two lists are generated, one for each method
print("Features selected by forward selection:", featurereduction.forward_selection(X, y))
print("Features selected by backward selection:", featurereduction.backward_elimination(X, y))
1 Image compression with SVD [30 pts] **[P]** **[W]**
 
Load images data and plot
In [2]:
# HELPER CELL, DO NOT MODIFY
# load Image
image = plt.imread("hw3_image_2.jpg")/255.
#plot image
fig = plt.figure(figsize=(10,10))
plt.imshow(image)

Out[2]:
<matplotlib.image.AxesImage at 0x7fd7f813eef0>
 
 
In [3]:
# HELPER CELL, DO NOT MODIFY
def rgb2gray(rgb):   
    return np.dot(rgb[...,:3], [0.299, 0.587, 0.114])

fig = plt.figure(figsize=(10, 10))
# plot several images
plt.imshow(rgb2gray(image), cmap=plt.cm.bone)

Out[3]:
<matplotlib.image.AxesImage at 0x7fd7f8c6b0f0>
 
 
 
1.1 Image compression [20pts] **[P]**
SVD is a dimensionality reduction technique that allows us to compress images by throwing away the least important information.

Higher singular values capture greater variance and thus carry more of the information in the corresponding singular vectors. To perform image compression, apply SVD on each matrix and discard the small singular values. The loss of information through this process is negligible and the difference between the images can hardly be spotted. For example, the proportion of variance captured by the first component is

σ_1 / (Σ_{i=1}^n σ_i)
where σ_i is the i-th singular value. You need to finish the following functions to do SVD and then reconstruct the image from its components.

Hint 1: http://timbaumann.info/svd-image-compression-demo/ is a useful article on image compression.
In [4]:
class ImgCompression(object):
    def __init__(self):
        pass

    def svd(self, X): # [5pts]
        """
        Do SVD. You could use numpy SVD.
        Your function should be able to handle black and white
        images (N*D arrays) as well as color images (N*D*3 arrays)
        In the image compression, we assume that each column of the image is a feature. The image is the matrix X.
        Args:
            X: N * D array corresponding to an image (N * D * 3 if color image)
        Return:
            U: N * N for black and white images / N * N * 3 for color images
            S: min(N, D) * 1 for black and white images / min(N, D) * 3 for color images
            V: D * D for black and white images / D * D * 3 for color images
        """
        # BnW image
        if len(X.shape) == 2:
            U, S, V = np.linalg.svd(X)
        else:
            # colour image
            N, D = X.shape[0], X.shape[1]
            min_ND = min(N, D)

            # channel0
            U0, S0, V0 = np.linalg.svd(X[:, :, 0])
            S0 = S0.reshape((min_ND, 1))

            # channel1
            U1, S1, V1 = np.linalg.svd(X[:, :, 1])
            S1 = S1.reshape((min_ND, 1))

            # channel2
            U2, S2, V2 = np.linalg.svd(X[:, :, 2])
            S2 = S2.reshape((min_ND, 1))

            #combining
            U = np.array([U0, U1, U2])
            U = U.transpose(1, 2, 0)

            S = np.concatenate((S0, S1, S2), axis=1)

            V = np.array([V0, V1, V2])
            V = V.transpose(1, 2, 0)
        return U, S, V
        
#         raise NotImplementedError


    def rebuild_svd(self, U, S, V, k): # [5pts]
        """
        Rebuild the image from the top k SVD components.
        Args:
            U: N*N (*3 for color images)
            S: min(N, D)*1 (*3 for color images)
            V: D*D (*3 for color images)
            k: int corresponding to number of components
        Return:
            Xrebuild: N*D array of reconstructed image (N*D*3 if color image)

        Hint: numpy.matmul may be helpful for reconstructing color images
        """
        if len(U.shape) == 2:
            # BnW image
            # term1
            t1 = np.matmul(U[:, :k], np.diag(S)[:k, :k])
            # term2
            Xrebuild = np.matmul(t1, V[:k, :])
        else:
            # colour image
            N = U.shape[0]
            D = V.shape[0]

            # channel 0
            U0 = U[:, :, 0]
            S0 = S[:, 0]
            V0 = V[:, :, 0]
            # term1
            t1 = np.matmul(U0[:, :k], np.diag(S0)[:k, :k])
            # term2
            Xrebuild0 = np.matmul(t1, V0[:k, :])

            # channel 1
            U1 = U[:, :, 1]
            S1 = S[:, 1]
            V1 = V[:, :, 1]
            # term1
            t1 = np.matmul(U1[:, :k], np.diag(S1)[:k, :k])
            # term2
            Xrebuild1 = np.matmul(t1, V1[:k, :])

            # channel 2
            U2 = U[:, :, 2]
            S2 = S[:, 2]
            V2 = V[:, :, 2]
            # term1
            t1 = np.matmul(U2[:, :k], np.diag(S2)[:k, :k])
            # term2
            Xrebuild2 = np.matmul(t1, V2[:k, :])

            # combining
            Xrebuild = np.array([Xrebuild0, Xrebuild1, Xrebuild2])
            Xrebuild = Xrebuild.transpose(1, 2, 0)

        return Xrebuild   

#         raise NotImplementedError

    def compression_ratio(self, X, k): # [5pts]
        """
        Compute compression of an image: (num stored values in compressed)/(num stored values in original)
        Args:
            X: N * D array corresponding to an image (N * D * 3 if color image)
            k: int corresponding to number of components
        Return:
            compression_ratio: float of proportion of storage used by compressed image
        """
        if len(X.shape) == 2:
            # BnW image
            num_orig = X.shape[0] * X.shape[1]
            num_compress = k * (1 + X.shape[0] + X.shape[1])
        else:
            # colour image
            num_orig = X.shape[0] * X.shape[1] * X.shape[2]
            num_compress = k * (1 + X.shape[0] + X.shape[1]) * X.shape[2]
            
        compression_ratio = num_compress * 1.0 / num_orig
        return compression_ratio
        
#         raise NotImplementedError

    def recovered_variance_proportion(self, S, k): # [5pts]
        """
        Compute the proportion of the variance in the original matrix recovered by a rank-k approximation

        Args:
           S: min(N, D)*1 (*3 for color images) of singular values for the image
           k: int, rank of approximation
        Return:
           recovered_var: float (array of 3 floats for color image) corresponding to proportion of recovered variance
        """
        S2 = S ** 2
        if len(S.shape) == 1:
            # BnW image
            recovered_var = np.sum(S2[:k]) / np.sum(S2)
        else:
            # colour image
            # channel0
            recovered_var0 = np.sum(S2[:k, 0]) / np.sum(S2[:, 0])
            # channel1
            recovered_var1 = np.sum(S2[:k, 1]) / np.sum(S2[:, 1])
            # channel2
            recovered_var2 = np.sum(S2[:k, 2]) / np.sum(S2[:, 2])
            recovered_var = [recovered_var0, recovered_var1, recovered_var2]

        return recovered_var        
        
#         raise NotImplementedError

 
1.2 Black and white [5 pts] **[W]**
Use your implementation to generate a set of images compressed to different degrees. Include the images in your non-programming submission to the assignment.
In [5]:
# HELPER CELL, DO NOT MODIFY
imcompression = ImgCompression()
bw_image = rgb2gray(image)
U, S, V = imcompression.svd(bw_image)
component_num = [1,2,5,10,20,40,80,160,256]

fig = plt.figure(figsize=(18, 18))

# plot several images
i=0
for k in component_num:
    img_rebuild = imcompression.rebuild_svd(U, S, V, k)
    c = np.around(imcompression.compression_ratio(bw_image, k), 4)
    r = np.around(imcompression.recovered_variance_proportion(S, k), 3)
    ax = fig.add_subplot(3, 3, i + 1, xticks=[], yticks=[])
    ax.imshow(img_rebuild, cmap=plt.cm.bone)
    ax.set_title(f"{k} Components")
    ax.set_xlabel(f"Compression: {c},\nRecovered Variance: {r}")
    i = i+1

 
 
 
1.3 Color image [5 pts] **[W]**
Use your implementation to generate a set of images compressed to different degrees. Include the images in your non-programming submission to the assignment.

Note: You might get the warning "Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers)." This warning is acceptable since, while rebuilding, some pixel values may go above 1.0. You should see an image similar to the original even with such clipping.
In [6]:
# HELPER CELL, DO NOT MODIFY
imcompression = ImgCompression()
U, S, V = imcompression.svd(image)

# component_num = [1,2,5,10,20,40,80,160,256]
component_num = [1,2,5,10,20,40,80,160,256]

fig = plt.figure(figsize=(18, 18))

# plot several images
i=0
for k in component_num:
    img_rebuild = imcompression.rebuild_svd(U, S, V, k)
    c = np.around(imcompression.compression_ratio(image, k), 4)
    r = np.around(imcompression.recovered_variance_proportion(S, k), 3)
    ax = fig.add_subplot(3, 3, i + 1, xticks=[], yticks=[])
    ax.imshow(img_rebuild)
    ax.set_title(f"{k} Components")
    ax.set_xlabel(f"Compression: {np.around(c,4)},\nRecovered Variance:  R: {r[0]}  G: {r[1]}  B: {r[2]}")
    i = i+1

 
Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).
(The same warning repeats for several of the reconstructed images.)

 
 
 
2 Understanding PCA  **[P]** | **[W]**
 
2.1 Implementation  **[P]**
Principal Component Analysis (PCA) is another dimensionality reduction technique that reduces dimensions by eliminating small variance eigenvalues and their vectors. With PCA, we center the data first by subtracting the mean. Each singular value tells us how much of the variance of a matrix (e.g. image) is captured in each component. In this problem, we will investigate how PCA can be used to improve features for regression and classification tasks and how the data itself affects the behavior of PCA.

Implement PCA in the below cell.

Assume a dataset is composed of N datapoints, each of which has D features with D < N. The dimension of our data would be D. It is possible, however, that many of these dimensions contain redundant information. Each feature explains part of the variance in our dataset. Some features may explain more variance than others.

In the following cell complete the PCA class by completing functions fit, transform and transform_rv.
In [7]:
class PCA(object):

    def __init__(self):
        self.U = None
        self.S = None
        self.V = None

    def fit(self, X): # 
        """
        Decompose dataset into principal components.
        You may use your SVD function from the previous part in your implementation or numpy.linalg.svd function.

        Don't return anything. You can directly set self.U, self.S and self.V declared in __init__ with
        corresponding values from PCA.

        Args:
            X: N*D array corresponding to a dataset
        Return:
            None
        """
        mu = np.mean(X, axis = 0, keepdims = True)
        X = X - mu
        U, S, V = np.linalg.svd(X, full_matrices = False)
        self.U = U
        self.S = S
        self.V = V
        
#         raise NotImplementedError

    def transform(self, data, K=2): # 2 pts
        """
        Transform data to reduce the number of features such that final data has given number of columns

        Utilize self.U, self.S and self.V that were set in fit() method.

        Args:
            data: N*D array corresponding to a dataset
            K: Int value for number of columns to be kept
        Return:
            X_new: N*K array corresponding to data obtained by applying PCA on data
        """
        self.fit(data)
        X_new = np.dot(data, self.V.T[:, :K])
        return X_new
        
#         raise NotImplementedError

    def transform_rv(self, data, retained_variance=0.99): # 3 pts
        """
        Transform data to reduce the number of features such that a given variance is retained

        Utilize self.U, self.S and self.V that were set in fit() method.

        Args:
            data: N*D array corresponding to a dataset
            retained_variance: Float value for amount of variance to be retained
        Return:
            X_new: N*K array corresponding to data obtained by applying PCA on data
        """
        self.fit(data)
        var = np.cumsum(self.S ** 2)
        var = var / var[-1]

        # number of available components is len(var) = min(N, D)
        for i in range(len(var)):
            if var[i] >= retained_variance:
                break

        X_new = np.dot(data, self.V.T[:, :i + 1])
        return X_new        

#         raise NotImplementedError

    def get_V(self):
        """ Getter function for value of V """
        
        return self.V

 
2.2 Visualize [5 pts] **[W]**
PCA is used to transform multivariate data tables into smaller sets so as to observe the hidden trends and variations in the data. Here you will visualize two datasets (iris and wine) using PCA. Use the above implementation of PCA and reduce the datasets such that they contain only two features. Make 2-D scatter plots of the data points using these features. Make sure to differentiate the data points according to their true labels. The datasets have already been loaded for you. In addition, return the retained variance obtained from the reduced features.
In [8]:
def visualize(X,y): # 
    """
    Args:
        X: NxD numpy array, where N is number of instances and D is the dimensionality of each instance
        y: numpy array (N,), the true labels
  
    Return:
        None
    """
    N = X.shape[0]
    
    #do dimensionality reduction
    obj = PCA()
    x_new = obj.transform(X, 2)
    
    #add y values as column to xnew    
    y_new = y.reshape((N, 1))
    x = np.hstack((x_new, y_new))
    
    #make scatter plot
    plot = plt.scatter(x[:, 0], x[:, 1], c = x[:, 2], marker = 'x')
    
#     raise NotImplementedError

In [9]:
# HELPER CELL, DO NOT MODIFY
#Use PCA for visualization of iris and wine data
iris_data = load_iris(return_X_y=True)

X = iris_data[0]
y = iris_data[1]

visualize(X, y)

 
 
In [10]:
# HELPER CELL, DO NOT MODIFY
wine_data = load_wine(return_X_y=True)

X = wine_data[0]
y = wine_data[1]

visualize(X, y)

 
 
 
Now you will use PCA on an actual real-world dataset. We will use your implementation of PCA to reduce the dataset while retaining 99% of the variance and use the result as the reduced features. On the reduced dataset, we will use logistic regression and ridge regression and compare results between the PCA and non-PCA datasets. Run the following cells to see how PCA works on classification and regression tasks.
In [11]:
# HELPER CELL, DO NOT MODIFY
#load the dataset 
iris = load_iris()

X = iris.data
y = iris.target

print("data shape before PCA ",X.shape)

pca = PCA()
pca.fit(X)

X_pca = pca.transform_rv(X)

print("data shape with PCA ",X_pca.shape)

 
data shape before PCA  (150, 4)
data shape with PCA  (150, 3)

In [12]:
# HELPER CELL, DO NOT MODIFY
# Train, test splits
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, 
                                                    stratify=y, 
                                                    random_state=42)

# Use logistic regression to predict classes for test set
clf = LogisticRegression()
clf.fit(X_train, y_train)
preds = clf.predict_proba(X_test)
print('Accuracy: {:.5f}'.format(accuracy_score(y_test, 
                                                preds.argmax(axis=1))))

 
Accuracy: 0.91111

In [13]:
# HELPER CELL, DO NOT MODIFY
# Train, test splits
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=.3, 
                                                    stratify=y, 
                                                    random_state=42)

# Use logistic regression to predict classes for test set
clf = LogisticRegression()
clf.fit(X_train, y_train)
preds = clf.predict_proba(X_test)
print('Accuracy: {:.5f}'.format(accuracy_score(y_test, 
                                                preds.argmax(axis=1))))

 
Accuracy: 0.91111

In [14]:
# HELPER CELL, DO NOT MODIFY
def apply_regression(X_train, y_train, X_test):
    ridge = Ridge()
    weight = ridge.fit(X_train, y_train)
    y_pred = ridge.predict(X_test)
    
    return y_pred

In [15]:
# HELPER CELL, DO NOT MODIFY
#load the dataset 
diabetes = load_diabetes()
X = diabetes.data
y = diabetes.target

print(X.shape, y.shape)

pca = PCA()
pca.fit(X)

X_pca = pca.transform_rv(X)
print("data shape with PCA ",X_pca.shape)

 
(442, 10) (442,)
data shape with PCA  (442, 8)

In [16]:
# HELPER CELL, DO NOT MODIFY
# Train, test splits
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=42)

#Ridge regression without PCA
y_pred = apply_regression(X_train, y_train, X_test)

# calculate RMSE 
rmse_score = np.sqrt(mean_squared_error(y_pred, y_test))
print("rmse score without PCA",rmse_score)

 
rmse score without PCA 55.79391924562032

In [17]:
# HELPER CELL, DO NOT MODIFY
#Ridge regression with PCA
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=.3, random_state=42)

#use Ridge Regression for getting predicted labels
y_pred = apply_regression(X_train,y_train,X_test)

#calculate RMSE 
rmse_score = np.sqrt(mean_squared_error(y_pred, y_test))
print("rmse score with PCA",rmse_score)

 
rmse score with PCA 55.78489213087043

 
For both tasks above, reducing the dataset with PCA does not hurt performance: the classification accuracy is unchanged and the regression RMSE improves slightly, even though fewer features are used.

Feel free to add other datasets in cell below and play around with what kind of improvement you get with using PCA. There are no points for playing around with other datasets.
In [18]:
######## YOUR CODE BELOW ########

#################################

 
3 Polynomial regression and regularization
 
3.1 Regression and regularization implementations 
We have three methods to fit linear and ridge regression models: 1) closed form; 2) gradient descent (GD); 3) stochastic gradient descent (SGD). For undergraduate students, you are required to implement the closed form for linear regression and for ridge regression; the other 4 methods are bonus parts. For graduate students, you are required to implement all of them. We use the term weight in the following code. Weights and parameters (θ) have the same meaning here. We used parameters (θ) in the lecture slides.
In [19]:
class Regression(object):
    
    def __init__(self):
        pass
    
    def rmse(self, pred, label): # [5pts]
        '''
        This is the root mean square error.
        Args:
            pred: numpy array of length N * 1, the prediction of labels
            label: numpy array of length N * 1, the ground truth of labels
        Return:
            a float value
        '''
        rmse = np.sqrt(np.mean((pred - label) ** 2))
        return rmse
        
#         raise NotImplementedError
    
    def construct_polynomial_feats(self, x, degree): # [5pts]
        """
        Args:
            x: numpy array of length N, the 1-D observations
            degree: the max polynomial degree
        Return:
            feat: numpy array of shape Nx(degree+1), remember to include 
            the bias term. feat is in the format of:
            [[1.0, x1, x1^2, x1^3, ....,],
             [1.0, x2, x2^2, x2^3, ....,],
             ......
            ]
        """
        N = x.shape[0]
        feat = np.ones((N))
        for i in range(1, degree + 1):
            x_term = np.power(x, i)
            temp = np.column_stack((feat, x_term))
            feat = temp
        return feat
        
#         raise NotImplementedError


    def predict(self, xtest, weight): # [5pts]
        """
        Args:
            xtest: NxD numpy array, where N is number 
                   of instances and D is the dimensionality of each 
                   instance
            weight: Dx1 numpy array, the weights of linear regression model
        Return:
            prediction: Nx1 numpy array, the predicted labels
        """
        prediction = np.dot(xtest, weight)
        return prediction
        
#         raise NotImplementedError
        
    # =================
    # LINEAR REGRESSION
    # Hints: in the fit function, use close form solution of the linear regression to get weights. 
    # For inverse, you can use numpy linear algebra function  
    # For the predict, you need to use linear combination of data points and their weights (y = theta0*1+theta1*X1+...)

    def linear_fit_closed(self, xtrain, ytrain): # [5pts]
        """
        Args:
            xtrain: N x D numpy array, where N is number of instances and D is the dimensionality of each instance
            ytrain: N x 1 numpy array, the true labels
        Return:
            weight: Dx1 numpy array, the weights of linear regression model
        """
        # term1
        t1 = np.linalg.pinv(np.dot(xtrain.T, xtrain))
        # term2
        t2 = np.dot(t1, xtrain.T)
        weight = np.dot(t2, ytrain)
        return weight
        
#         raise NotImplementedError

    def linear_fit_GD(self, xtrain, ytrain, epochs=5, learning_rate=0.001): # [5pts]
        """
        Args:
            xtrain: NxD numpy array, where N is number 
                    of instances and D is the dimensionality of each 
                    instance
            ytrain: Nx1 numpy array, the true labels
        Return:
            weight: Dx1 numpy array, the weights of linear regression model
        """
        N, D = xtrain.shape[0], xtrain.shape[1]
        weight = np.zeros((D, 1))
        for i in range(epochs):
            temp = weight + learning_rate * (np.dot(xtrain.T, (ytrain - np.dot(xtrain, weight)))) / N
            weight = temp
        return weight
        
#         raise NotImplementedError

    def linear_fit_SGD(self, xtrain, ytrain, epochs=100, learning_rate=0.001): # [5pts]
        """
        Args:
            xtrain: NxD numpy array, where N is number 
                    of instances and D is the dimensionality of each 
                    instance
            ytrain: Nx1 numpy array, the true labels
        Return:
            weight: Dx1 numpy array, the weights of linear regression model
        """
        N, D = xtrain.shape[0], xtrain.shape[1]
        weight = np.zeros((D, 1))
        for i in range(epochs):
            t1 = ytrain - np.dot(xtrain, weight)
            idx = i % N
            t2 = np.dot(xtrain[idx, :].T.reshape((D, 1)), t1[idx].reshape((1, 1)))
            temp = weight + learning_rate * t2
            weight = temp
        return weight
        
#         raise NotImplementedError
        
    # =================
    # RIDGE REGRESSION
        
    def ridge_fit_closed(self, xtrain, ytrain, c_lambda): # [5pts]
        """
        Args:
            xtrain: N x D numpy array, where N is number of instances and D is the dimensionality of each instance
            ytrain: N x 1 numpy array, the true labels
            c_lambda: floating number
        Return:
            weight: Dx1 numpy array, the weights of ridge regression model
        """
        N, D = xtrain.shape[0], xtrain.shape[1]
        ide = np.identity(D)
        ide[0][0] = 0.0           
        # term1
        t1 = np.linalg.pinv(np.dot(xtrain.T, xtrain) + c_lambda * ide)
        # term2
        t2 = np.dot(t1, xtrain.T)
        weight = np.dot(t2, ytrain)
        return weight
        
#         raise NotImplementedError

        
    def ridge_fit_GD(self, xtrain, ytrain, c_lambda, epochs=500, learning_rate=1e-7): # [5pts]
        """
        Args:
            xtrain: NxD numpy array, where N is number 
                    of instances and D is the dimensionality of each 
                    instance
            ytrain: Nx1 numpy array, the true labels
            c_lambda: floating number
        Return:
            weight: Dx1 numpy array, the weights of linear regression model
        """
        N, D = xtrain.shape[0], xtrain.shape[1]
        weight = np.zeros((D, 1))
        for i in range(epochs):
            # term 1: gradient of the squared-error part
            t1 = np.dot(xtrain.T, (ytrain - np.dot(xtrain, weight)))
            # subtract the gradient of the ridge penalty (c_lambda * weight)
            weight = weight + learning_rate * (t1 - c_lambda * weight) / N
        return weight
        
#         raise NotImplementedError

    def ridge_fit_SGD(self, xtrain, ytrain, c_lambda, epochs=100, learning_rate=0.001): # [5pts]
        """
        Args:
            xtrain: NxD numpy array, where N is number 
                    of instances and D is the dimensionality of each 
                    instance
            ytrain: Nx1 numpy array, the true labels
        Return:
            weight: Dx1 numpy array, the weights of linear regression model
        """
        N, D = xtrain.shape[0], xtrain.shape[1]
        weight = np.zeros((D, 1))
        for i in range(epochs):
            t1 = ytrain - np.dot(xtrain, weight)
            idx = i % N
            t2 = np.dot(xtrain[idx, :].T.reshape((D, 1)), t1[idx].reshape((1, 1)))
            # subtract the gradient of the ridge penalty (c_lambda * weight)
            weight = weight + learning_rate * (t2 - c_lambda * weight)
        return weight
        
#         raise NotImplementedError

    def ridge_cross_validation(self, X, y, kfold=10, c_lambda=100): # [5 pts]
        """
        Args: 
            X : NxD numpy array, where N is the number of instances and D is the dimensionality of each instance
            y : Nx1 numpy array, true labels
            kfold: Number of folds you should take while implementing cross validation.
            c_lambda: Value of regularization constant
        Returns:
            meanErrors: Float average rmse error
        Hint: np.concatenate might be helpful.
        Look at 3.5 to see how this function is being used.
        # For cross validation, use 10-fold method and only use it for your training data (you already have the train_indices to get training data).
        # For the training data, split them in 10 folds which means that use 10 percent of training data for test and 90 percent for training.
        """
        N, D = X.shape[0], X.shape[1]
        size_of_fold = int(N / kfold)
        meanErrors = 0.0
        for i in range(kfold):
            # train model on every fold except ith
            xtrain1 = X[:i * size_of_fold, :]
            xtrain2 = X[(i + 1) * size_of_fold:, :]
            xtrain = np.concatenate((xtrain1, xtrain2))
            ytrain1 = y[:i * size_of_fold]
            ytrain2 = y[(i + 1) * size_of_fold:]
            ytrain = np.concatenate((ytrain1, ytrain2))
            weight = self.ridge_fit_closed(xtrain, ytrain, c_lambda)

            # compute error on ith fold
            xtest = X[i * size_of_fold:(i + 1) * size_of_fold, :]
            ytest = y[i * size_of_fold:(i + 1) * size_of_fold]
            pred = self.predict(xtest, weight)
            error = self.rmse(pred, ytest)

            # add to error
            meanErrors += error

        meanErrors /= kfold
        return meanErrors
            
        
#         raise NotImplementedError

 
3.2 About RMSE [3 pts] **[W]**
Do you know whether this RMSE is good or not? If you don't know, we could normalize our labels between 0 and 1. After normalization, what does it mean when RMSE = 1?

Hint: think of the way that you can enforce your RMSE = 1. Note that you can not change the actual labels to make RMSE = 1.
 
Although we want a small RMSE for a good regression model, the raw value by itself is hard to interpret because it depends on the scale of the labels. If we normalize the labels to lie between 0 and 1, the RMSE becomes comparable across problems. In that normalized setting, RMSE = 1 means the predictions are off by the entire range of the labels on average (for example, predicting 0 whenever the true label is 1), which is essentially the worst a model restricted to the label range can do, so such a model is not good.
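As a quick numerical sanity check of that statement (a toy snippet, not part of the assignment), a model that always predicts the opposite extreme of labels normalized to [0, 1] gives RMSE = 1:

import numpy as np

label = np.array([0.0, 0.0, 1.0, 1.0])          # labels normalized to [0, 1]
pred = 1.0 - label                               # always predict the opposite extreme
print(np.sqrt(np.mean((pred - label) ** 2)))     # -> 1.0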
 
3.3 Testing: general functions and linear regression **[W]**
Let's first construct a dataset for polynomial regression.

In this case, we construct the polynomial features up to degree 5. Each data sample consists of two features [a, b]. We compute the polynomial features of both a and b in order to yield the vectors [1, a, a^2, a^3, ..., a^degree] and [1, b, b^2, b^3, ..., b^degree]. We train our model with the Cartesian product of these polynomial features. The Cartesian product generates a new feature vector consisting of all polynomial combinations of the features with degree less than or equal to the specified degree.

For example, if degree = 2, we will have the polynomial features [1, a, a^2] and [1, b, b^2] for the data point [a, b]. The Cartesian product of these two vectors will be [1, a, b, ab, a^2, b^2]. We do not generate a^3 and b^3 since their degree is greater than 2 (the specified degree).
In [20]:
#helper, do not need to change
POLY_DEGREE = 5
NUM_OBS = 1000

rng = np.random.RandomState(seed=4)

true_weight = -rng.rand((POLY_DEGREE)**2+2, 1)
true_weight[2:, :] = 0
x_all1 = np.linspace(-5, 5, NUM_OBS)
x_all2 = np.linspace(-3, 3, NUM_OBS)
x_all = np.stack((x_all1,x_all2), axis=1)

reg = Regression()
x_all_feat1 = reg.construct_polynomial_feats(x_all[:,0], POLY_DEGREE)
x_all_feat2 = reg.construct_polynomial_feats(x_all[:,1], POLY_DEGREE)

x_cart_flat = []
for i in range(x_all_feat1.shape[0]):
    x1 = x_all_feat1[i]
    x2 = x_all_feat2[i]
    x1_end = x1[-1]
    x2_end = x2[-1]
    x1 = x1[:-1]
    x2 = x2[:-1]
    x3 = np.asarray([[m*n for m in x1] for n in x2])

    x3_flat = np.reshape(x3, (x3.shape[0]**2))
    x3_flat = list(x3_flat)
    x3_flat.append(x1_end)
    x3_flat.append(x2_end)
    x3_flat = np.asarray(x3_flat)
    x_cart_flat.append(x3_flat)
    
x_cart_flat = np.asarray(x_cart_flat)
x_all_feat = np.copy(x_cart_flat)

y_all = np.dot(x_cart_flat, true_weight) + rng.randn(x_all_feat.shape[0], 1) # in the second term, we add noise to data
print(x_all_feat.shape, y_all.shape)

# Note that here we try to produce y_all as our training data
#plot_curve(x_all, y_all) # Data with noise that we are going to predict
#plot_curve(x_all, np.dot(x_cart_flat, true_weight), curve_type='-', color='r', lw=4) # the groundtruth information

indices = rng.permutation(NUM_OBS)

 
(1000, 27) (1000, 1)

In [21]:
# HELPER CELL, DO NOT MODIFY
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

p = np.reshape(np.dot(x_cart_flat, true_weight), (1000,))
print(x_all[:,0].shape, x_all[:,1].shape,p.shape)
ax.plot(x_all[:,0], x_all[:,1], p, c="red",linewidth=4)
ax.scatter(x_all[:,0], x_all[:,1], y_all,s=4)

 
(1000,) (1000,) (1000,)

Out[21]:
<mpl_toolkits.mplot3d.art3d.Path3DCollection at 0x7fd7d738a940>
 
 
 
In the figure above, the red curve is the true function we want to learn, while the blue dots are the noisy observations. The observations are generated by Y = Xθ + σ, where the entries of σ are i.i.d. N(0, 1) noise.

Now let's split the data into two parts, namely the training set and test set. The red dots are for training, while the blue dots are for testing.
In [22]:
# HELPER CELL, DO NOT MODIFY
train_indices = indices[:NUM_OBS//2]
test_indices = indices[NUM_OBS//2:]

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

xtrain = x_all[train_indices]
ytrain = y_all[train_indices]
xtest = x_all[test_indices]
ytest = y_all[test_indices]

print(xtrain.shape, xtest.shape, y_all.shape)
ax.scatter(xtrain[:,0], xtrain[:,1], ytrain, c='r',s=4)
ax.scatter(xtest[:,1], xtest[:,1], ytest, c='b',s=4)

 
(500, 2) (500, 2) (1000, 1)

Out[22]:
<mpl_toolkits.mplot3d.art3d.Path3DCollection at 0x7fd7d7228710>
 
 
 
Now let's first train using the entire training set, and see how we performs on the test set and how the learned function look like.
In [23]:
# HELPER CELL, DO NOT MODIFY
weight = reg.linear_fit_closed(x_all_feat[train_indices], y_all[train_indices])
y_test_pred = reg.predict(x_all_feat[test_indices], weight)
test_rmse = reg.rmse(y_test_pred, y_all[test_indices])
print('test rmse: %.4f' % test_rmse)

 
test rmse: 0.9589

In [24]:
# HELPER CELL, DO NOT MODIFY
weight = reg.linear_fit_GD(x_all_feat[train_indices], y_all[train_indices], epochs=500000, learning_rate=1e-9)
y_test_pred = reg.predict(x_all_feat[test_indices], weight)
test_rmse = reg.rmse(y_test_pred, y_all[test_indices])
print('test rmse: %.4f' % test_rmse)

 
test rmse: 1.2971

 
And what if we just use the first 10 observations to train?
In [25]:
# HELPER CELL, DO NOT MODIFY
sub_train = train_indices[:10]
weight = reg.linear_fit_closed(x_all_feat[sub_train], y_all[sub_train])
y_test_pred = reg.predict(x_all_feat[test_indices], weight)
test_rmse = reg.rmse(y_test_pred, y_all[test_indices])
print('test rmse: %.4f' % test_rmse)

 
test rmse: 5.2039

 
Did you see a worse performance? Let's take a closer look at what we have learned.
In [26]:
# HELPER CELL, DO NOT MODIFY
y_pred = reg.predict(x_all_feat, weight)

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

x1 = x_all[:,0]
x2 = x_all[:,0]
y_pred = np.reshape(y_pred, (1000,))
ax.plot(x1, x2, y_pred, color='b', lw=4)

x3 = x_all[sub_train,0]
x4 = x_all[sub_train,1]
ax.scatter(x3, x4, y_all[sub_train], s=100, c='r', marker='x')

y_test_pred = reg.predict(x_all_feat[test_indices], weight)

 
 
 
3.4.1 Testing: ridge regression [5 pts] **[W]**
Now let's try ridge regression. Similarly, undergraduate students need to implement the closed form, and graduate students need to implement all three methods. We will call the prediction function from the linear regression part.
 
Again, let's see what we have learned. You only need to run the cell corresponding to your specific implementation.
In [27]:
# HELPER CELL, DO NOT MODIFY
sub_train = train_indices[:10]
print(x_all_feat[sub_train].shape)
print(y_all[sub_train].shape)
weight = reg.ridge_fit_closed(x_all_feat[sub_train], y_all[sub_train], c_lambda=1000)

y_pred = reg.predict(x_all_feat, weight)

y_test_pred = reg.predict(x_all_feat[test_indices], weight)
test_rmse = reg.rmse(y_test_pred, y_all[test_indices])
print('test rmse: %.4f' % test_rmse)

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

x1 = x_all[:,0]
x2 = x_all[:,0]
y_pred = np.reshape(y_pred, (1000,))
ax.plot(x1, x2, y_pred, color='b', lw=4)

x3 = x_all[sub_train,0]
x4 = x_all[sub_train,1]
ax.scatter(x3, x4, y_all[sub_train], s=100, c='r', marker='x')

y_test_pred = reg.predict(x_all_feat[test_indices], weight)

 
(10, 27)
(10, 1)
test rmse: 1.4805

 
 
In [28]:
# HELPER CELL, DO NOT MODIFY
sub_train = train_indices[:10]
weight = reg.ridge_fit_GD(x_all_feat[sub_train], y_all[sub_train], c_lambda=1000, learning_rate=1e-9)

y_pred = reg.predict(x_all_feat, weight)

y_test_pred = reg.predict(x_all_feat[test_indices], weight)
test_rmse = reg.rmse(y_test_pred, y_all[test_indices])
print('test rmse: %.4f' % test_rmse)

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

x1 = x_all[:,0]
x2 = x_all[:,0]
y_pred = np.reshape(y_pred, (1000,))
ax.plot(x1, x2, y_pred, color='b', lw=4)

x3 = x_all[sub_train,0]
x4 = x_all[sub_train,1]
ax.scatter(x3, x4, y_all[sub_train], s=100, c='r', marker='x')

y_test_pred = reg.predict(x_all_feat[test_indices], weight)

 
test rmse: 1.6795

 
 
In [29]:
# HELPER CELL, DO NOT MODIFY
sub_train = train_indices[:10]
weight = reg.ridge_fit_SGD(x_all_feat[sub_train], y_all[sub_train], c_lambda=1000, learning_rate=1e-9)

y_pred = reg.predict(x_all_feat, weight)

y_test_pred = reg.predict(x_all_feat[test_indices], weight)
test_rmse = reg.rmse(y_test_pred, y_all[test_indices])
print('test rmse: %.4f' % test_rmse)

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

x1 = x_all[:,0]
x2 = x_all[:,0]
y_pred = np.reshape(y_pred, (1000,))
ax.plot(x1, x2, y_pred, color='b', lw=4)

x3 = x_all[sub_train,0]
x4 = x_all[sub_train,1]
ax.scatter(x3, x4, y_all[sub_train], s=100, c='r', marker='x')

y_test_pred = reg.predict(x_all_feat[test_indices], weight)

 
test rmse: 1.6893

 
 
 
3.4.2 Lasso and Ridge Regression  **[W]**
We train two linear regression models with different regularizations: one with lasso regularization and the other with ridge regularization. Let w1 be the final weight vector for the model with lasso regularization and let w2 be the final weight vector for the model with ridge regularization. How do w1 and w2 differ in terms of sparsity? For ridge regression, how do the weights change with a change in lambda?
 
If we train two linear regression models with lasso and ridge regularization, then w1, the final weight vector obtained with lasso regularization, will be more sparse than w2, the final weight vector obtained with ridge regularization. This is because the L1 (lasso) penalty drives many of the weights exactly to zero. Ridge regularization also penalizes the weights, but it only shrinks them towards zero and does not generally make them exactly zero.

For ridge regression, a higher lambda means a stronger penalty, which results in smaller weights.
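A quick side experiment illustrating this (using scikit-learn's Lasso and Ridge on the diabetes data loaded earlier; the exact counts depend on alpha, but lasso typically zeroes out several coefficients while ridge zeroes none):

import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.linear_model import Lasso, Ridge

X, y = load_diabetes(return_X_y=True)
w1 = Lasso(alpha=1.0).fit(X, y).coef_            # weights with lasso (L1) regularization
w2 = Ridge(alpha=1.0).fit(X, y).coef_            # weights with ridge (L2) regularization
print("exactly-zero weights, lasso:", np.sum(w1 == 0))
print("exactly-zero weights, ridge:", np.sum(w2 == 0))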
 
3.5 Cross validation [7 pts] **[W]**
Let's use Cross Validation to find the best value for c_lambda in ridge regression.
In [30]:
# We provided 6 possible values for lambda, and you will use them in cross validation.
# For cross validation, use 10-fold method and only use it for your training data (you already have the train_indices to get training data).
# For the training data, split them in 10 folds which means that use 10 percent of training data for test and 90 percent for training.
# At the end, for each lambda you will have calculated 10 RMSE values; take their mean.
# That's it. Pick the lambda with the lowest mean RMSE.
# Hint: np.concatenate is your friend.
best_lambda = None
best_error = None
kfold = 10
lambda_list = [0.1, 1, 5, 10, 100, 1000]

for lm in lambda_list:
    err = reg.ridge_cross_validation(x_all_feat[train_indices], y_all[train_indices], kfold, lm)
    print('lambda: %.2f' % lm, 'error: %.6f'% err)
    if best_error is None or err < best_error:
        best_error = err
        best_lambda = lm

print('best_lambda: %.2f' % best_lambda)
weight = reg.ridge_fit_closed(x_all_feat[train_indices], y_all[train_indices], c_lambda=best_lambda)  # refit with the best lambda found above
y_test_pred = reg.predict(x_all_feat[test_indices], weight)
test_rmse = reg.rmse(y_test_pred, y_all[test_indices])
print('test rmse: %.4f' % test_rmse)  

 
lambda: 0.10 error: 1.015281
lambda: 1.00 error: 1.015235
lambda: 5.00 error: 1.015084
lambda: 10.00 error: 1.014995
lambda: 100.00 error: 1.019286
lambda: 1000.00 error: 1.035747
best_lambda: 10.00
test rmse: 0.9593

 
4. Naive Bayes Classification 
 
In Bayesian classification, we're interested in finding the probability of a label given some observed feature vector x = [x1, .., xd], which we can write as P(y | x1, .., xd). Bayes's theorem tells us how to express this in terms of quantities we can compute more directly:

P(y | x1, .., xd) = P(x1, .., xd | y) P(y) / P(x1, .., xd)
The main assumption in Naive Bayes is that, given the label, the observed features are conditionally independent, i.e.

P(x1, .., xd | y) = P(x1 | y) × .. × P(xd | y)
Therefore, we can rewrite Bayes' rule as

P(y | x1, .., xd) = P(x1 | y) × .. × P(xd | y) P(y) / P(x1, .., xd)
Training Naive Bayes
One way to train a Naive Bayes classifier is with a frequentist approach to estimating the probabilities: we simply go over the training data and calculate the frequency of different observations in the training set given different labels. For example,

P(x1 = i | y = j) = P(x1 = i, y = j) / P(y = j) = (number of times x1 = i and y = j in the training data) / (number of times y = j in the training data)
Testing Naive Bayes
During the testing phase, we try to estimate the probability of a label given an observed feature vector, combining the probabilities computed from the training data. For example, if we are trying to decide between two labels y1 and y2, then we compute the ratio of the posterior probabilities of the two labels:

P(y1 | x1, .., xd) / P(y2 | x1, .., xd) = [P(x1, .., xd | y1) / P(x1, .., xd | y2)] · [P(y1) / P(y2)] = [P(x1 | y1) × .. × P(xd | y1) P(y1)] / [P(x1 | y2) × .. × P(xd | y2) P(y2)]
All we need now is to compute P(x1 | yi), .., P(xd | yi) and P(yi) for each label by plugging in the numbers we got during training. The label with the higher posterior probability is the one that is selected.
 
4.1 Bayes in Advertisements [5pts] **[W]**
 
An advertising agency wants to analyze the advertisements for a product. They want to target people from all age groups. They show 5 advertisement videos to 200 people. The following is the data on how many people from each group liked each video.

Age Group    Total    Video 1    Video 2    Video 3    Video 4    Video 5
16 - 35      100      20         30         60         15         90
36 - 55      60       15         50         30         20         40
Above 55     40       35         30         10         10         5
Total        200      70         110        100        45         135
A new consumer is shown the videos and he likes videos 2, 3 and 5. Which age group is he most likely to belong to?

Note: You can assume that each person provides an opinion about each video independently, i.e. Person 1 liking Video 1 has no effect on their assessment of Video 2.
 
We are given that the person likes videos 2, 3 and 5. For calculation purposes, we will assume that the person does not like videos 1 and 4. We now find, for each age group, the probability of liking videos 2, 3 and 5 and disliking videos 1 and 4; the age group with the highest posterior probability is the one the new person most likely belongs to.

First, let's find the prior probabilities of each age group. We have:

P(age = 16-35) = 100/200 = 0.5
P(age = 36-55) = 60/200 = 0.3
P(age = 55+) = 40/200 = 0.2
Now, let this new person be represented by

x = {x1 = 0, x2 = 1, x3 = 1, x4 = 0, x5 = 1}
Now, the posterior probabilities can be calculated by Bayes' rule as

P(age = 16-35 | x) = P(x | age = 16-35) · P(age = 16-35) / P(x)
P(age = 36-55 | x) = P(x | age = 36-55) · P(age = 36-55) / P(x)
P(age = 55+ | x) = P(x | age = 55+) · P(age = 55+) / P(x)
Now, for comparison, we can drop the denominator since it is the same in all three equations. Calculating the numerators of the above expressions gives us:

P(x | age = 16-35) · P(age = 16-35) = P(x1 = 0 | 16-35) · P(x2 = 1 | 16-35) · P(x3 = 1 | 16-35) · P(x4 = 0 | 16-35) · P(x5 = 1 | 16-35) · P(age = 16-35) = (80 · 30 · 60 · 85 · 90) / 100^5 · 0.5 = 0.11 · 0.5 = 0.055
P(x | age = 36-55) · P(age = 36-55) = P(x1 = 0 | 36-55) · P(x2 = 1 | 36-55) · P(x3 = 1 | 36-55) · P(x4 = 0 | 36-55) · P(x5 = 1 | 36-55) · P(age = 36-55) = (45 · 50 · 30 · 40 · 40) / 60^5 · 0.3 = 0.1389 · 0.3 = 0.0417
P(x | age = 55+) · P(age = 55+) = P(x1 = 0 | 55+) · P(x2 = 1 | 55+) · P(x3 = 1 | 55+) · P(x4 = 0 | 55+) · P(x5 = 1 | 55+) · P(age = 55+) = (5 · 30 · 10 · 30 · 5) / 40^5 · 0.2 = 0.002 · 0.2 = 0.0004
Since the posterior score for the 16-35 age group is the highest, the new person most likely belongs to this age group.
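For a quick numerical check of the arithmetic above, the same three scores can be computed directly from the like-count table (a minimal sketch; the counts come from the table in this question, while the array layout and variable names are my own):

```python
import numpy as np

# Rows: age groups 16-35, 36-55, Above 55; columns: like counts for videos 1-5
likes  = np.array([[20, 30, 60, 15, 90],
                   [15, 50, 30, 20, 40],
                   [35, 30, 10, 10,  5]], dtype=float)
totals = np.array([100, 60, 40], dtype=float)
priors = totals / totals.sum()                 # [0.5, 0.3, 0.2]

x = np.array([0, 1, 1, 0, 1])                  # likes videos 2, 3 and 5 only

p_like = likes / totals[:, None]               # P(x_j = 1 | age group)
cond = np.where(x == 1, p_like, 1 - p_like)    # per-video likelihood of this person
scores = cond.prod(axis=1) * priors            # numerators of Bayes' rule

print(scores)           # approx. [0.0551, 0.0417, 0.0004]
print(scores.argmax())  # 0 -> the 16-35 age group
```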
 
4.2 The Federalist Papers [15pts] **[P]**
 
The Federalist Papers were a series of essays written in 1787–1788 meant to persuade the citizens of the State of New York to ratify the Constitution and which were published anonymously under the pseudonym “Publius”. In later years the authors were revealed as Alexander Hamilton, John Jay, and James Madison. However, there is some disagreement as to who wrote which essays. Hamilton wrote a list of which essays he had authored only days before being killed in a duel with then Vice President Aaron Burr. Madison wrote his own list many years later, which is in conflict with Hamilton’s list on 12 of the essays. Since by this point the two (who were once close friends) had become bitter rivals, historians have long been unsure as to the reliability of both lists. We will try to settle this dispute using a simple Naive Bayes classifier.

The code which is provided loads the documents and builds a “bag of words” representation of each document. Your task is to complete the missing portions of the code and to determine your best guess as to who wrote each of the 12 disputed essays. (Hint: H and M are the labels that stand for Hamilton and Madison, while the label D stands for "disputed" for the papers we are trying to label in our data. Our job here is to decide whether each D essay belongs to H or M using Naive Bayes. Note that the label D, for disputed, is completely unrelated to the feature dimension D, which is an integer.)

The _priors_ratio function calculates the ratio of the class prior probabilities, i.e. the probability that a document belongs to Hamilton versus Madison. We do this based on word counts rather than document counts.

The _likelihood_ratio function calculates, for each word, the ratio of the probability of that word under Hamilton to its probability under Madison.

Note 1: In _likelihood_ratio() add one to each word count so as to avoid issues with zero word count. This is known as Add-1 smoothing. It is a type of additive smoothing.
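As a tiny illustration of why Add-1 (Laplace) smoothing helps, here is a toy example with made-up counts (not the homework data); without the +1, a word that never appears for one author would zero out every product it enters:

```python
import numpy as np

counts = np.array([0, 3, 7])                         # toy word counts; the first word never appears
probs_raw = counts / counts.sum()                    # [0.0, 0.3, 0.7] -> the zero kills any product
probs_smoothed = (counts + 1) / (counts + 1).sum()   # [0.077, 0.308, 0.615] -> no zeros
print(probs_raw, probs_smoothed)
```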
In [31]:
class NaiveBayes(object):

    def __init__(self):
        # load Documents
        x = open('fedpapers_split.txt').read()
        papers = json.loads(x)

        # split Documents
        papersH = papers[0] # papers by Hamilton 
        papersM = papers[1] # papers by Madison
        papersD = papers[2] # disputed papers

        # Number of Documents for H, M and D
        nH = len(papersH)
        nM = len(papersM)
        nD = len(papersD)
        
        '''To ignore certain common words in English that might skew your model, we add them to the stop words 
        list below. You may want to experiment by choosing your own list of stop words, but be sure to keep 
        'HAMILTON' and 'MADISON' in this list at a minimum, as their names appear in the text of the papers 
        and leaving them in could lead to unpredictable results '''

        stop_words = text.ENGLISH_STOP_WORDS.union({'HAMILTON','MADISON'})
        #stop_words = {'HAMILTON','MADISON'}
        # Form bag of words model using words used at least 10 times
        vectorizer = text.CountVectorizer(stop_words=stop_words,min_df=10)
        X = vectorizer.fit_transform(papersH+papersM+papersD).toarray()

        '''To visualize the full list of words remaining after filtering out stop words and words used less 
        than min_df times uncomment the following line'''
        #print(vectorizer.vocabulary_)

        # split word counts into separate matrices
        self.XH, self.XM, self.XD = X[:nH,:], X[nH:nH+nM,:], X[nH+nM:,:]
        

    def _likelihood_ratio(self, XH, XM): # 
        '''
        Args:
            XH: nH x D where nH is the number of documents that we have for Hamilton,
                while D is the number of features (we use the word count as the feature)
            XM: nM x D where nM is the number of documents that we have for Madison,
                while D is the number of features (we use the word count as the feature)
        Return:
            fratio: 1 x D vector of the likelihood ratio of different words (Hamilton/Madison)
        '''
        # Estimate probability of each word in vocabulary being used by Hamilton 
        
        # Estimate probability of each word in vocabulary being used by Madison 
        
        # Compute ratio of these probabilities
        # Hamilton
        total_freq_H = np.sum(XH, axis=0, keepdims=True) + 1

        # Madison
        total_freq_M = np.sum(XM, axis=0, keepdims=True) + 1

        fratio = (total_freq_H * np.sum(total_freq_M)) / (total_freq_M * np.sum(total_freq_H))
        return fratio
        
#         raise NotImplementedError
    
    def _priors_ratio(self, XH, XM): # 
        '''
        Args:
            XH: nH x D where nH is the number of documents that we have for Hamilton,
                while D is the number of features (we use the word count as the feature)
            XM: nM x D where nM is the number of documents that we have for Madison,
                while D is the number of features (we use the word count as the feature)
        Return:
            pr: prior ratio of (Hamilton/Madison)
        '''
        words_in_H = np.sum(XH)
        words_in_M = np.sum(XM)
        prior_ratio = float(words_in_H / words_in_M)
        return prior_ratio
        
        
#         raise NotImplementedError

    def classify_disputed(self, fratio, pratio, XD): # 
        '''
        Args:
            fratio: 1 x D vector of ratio of likelihoods of different words
            pratio: 1 x 1 number
            XD: 12 x D bag of words representation of the 12 disputed documents (D = 1307 which are the number of features for each document)
        Return:
             1 x 12 list, each entry is 'H' to indicate Hamilton or 'M' to indicate Madison for the corresponding document
        '''   
        res = []
        for i in range(12):
            doc = XD[i]
            t1 = np.power(fratio, doc)
            prob_of_H = np.prod(t1) * pratio
            if prob_of_H > 0.5:
                res.append('H')
            else:
                res.append('M')
        return res
#         raise NotImplementedError
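A side note on numerics: classify_disputed multiplies roughly 1300 per-word ratio terms, so the running product can underflow or overflow in floating point. The sketch below is an equivalent check in log space (the function name classify_disputed_log is my own; it compares the posterior ratio against 1, i.e. 0 in log space, which is the usual decision boundary for a ratio, whereas the cell above thresholds the ratio at 0.5, so results could differ for borderline documents):

```python
import numpy as np

def classify_disputed_log(fratio, pratio, XD):
    """Same style of decision as classify_disputed, but computed in log space."""
    # log( prod(fratio ** counts) * pratio )  ==  counts @ log(fratio) + log(pratio)
    log_ratio = XD @ np.log(fratio.ravel()) + np.log(pratio)
    return ['H' if lr > 0 else 'M' for lr in log_ratio]
```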

In [32]:
# HELPER CELL, DO NOT MODIFY
NB = NaiveBayes()
fratio = NB._likelihood_ratio(NB.XH, NB.XM)
pratio = NB._priors_ratio(NB.XH, NB.XM)
resolved = NB.classify_disputed(fratio, pratio, NB.XD)
    
print(resolved)

 
['M', 'M', 'M', 'M', 'H', 'H', 'M', 'H', 'H', 'M', 'M', 'M']

 
5 Noise in PCA and Linear Regression  **[W]**
 
Both PCA and least squares regression can be viewed as algorithms for inferring (linear) relationships among data variables. In this part of the assignment, you will develop some intuition for the differences between these two approaches, and an understanding of the settings that are better suited to using PCA or better suited to using the least squares fit.

The high-level bit is that PCA is useful when there is a set of latent (hidden/underlying) variables and all the coordinates of your data are linear combinations (plus noise) of those variables. The least squares fit is useful when you have direct access to the independent variables, so that only the dependent variable is a noisy linear combination of known variables.

5.1 Slope Functions (5 Pts) **[W]**
In the following cell:

For this function, assume that X is the first feature and Y is the second feature of the data. Write a function pca_slope that takes a vector X of $x_i$'s and a vector Y of $y_i$'s and returns the slope of the first principal component found by PCA.
Write a function lr_slope that takes X and y and returns the slope of the least squares fit. (Hint: since X is one-dimensional, this takes a particularly simple form, $\big((X - \bar{X}) \cdot (Y - \bar{Y})\big) / \|X - \bar{X}\|_2^2$, where $\bar{X}$ is the mean value of X.) A standalone sketch of this closed-form slope appears right after this list.
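Here is the promised sketch of the hint's closed-form slope (standalone, with my own function name; it implements the centered formula, which can differ slightly from a no-intercept fit such as linear_fit_closed when the data are noisy):

```python
import numpy as np

def lr_slope_closed_form(x, y):
    """Least squares slope for 1-D x via ((x - mean(x)) . (y - mean(y))) / ||x - mean(x)||^2."""
    xc = x - x.mean()
    yc = y - y.mean()
    return np.dot(xc, yc) / np.dot(xc, xc)

x = np.arange(0, 1, 0.01)
print(lr_slope_closed_form(x, 2 * x))  # prints 2.0
```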
In later subparts, we consider the case where our data consists of noisy measurements of x and y. For each part, we will evaluate the quality of the relationship recovered by PCA, and that recovered by standard least squares regression.

As a reminder, least squares regression minimizes the squared error of the dependent variable from its prediction. Namely, given $(x_i, y_i)$ pairs, least squares returns the line $l(x)$ that minimizes $\sum_i (y_i - l(x_i))^2$.

Note 1: You should use the PCA and Linear Regression implementations from Q2 and Q3 in this question. Do not use any kind of regularization for Linear Regression.
In [33]:
def pca_slope(x, y):
    """
    Calculates the slope of the first principal component given by PCA

    Args: 
        x: (N,) vector of feature x
        y: (N,) vector of feature y
    Return:
        slope: slope of the first principal component
    """
    N = x.shape[0]
    
    #make dataset
    x = x.reshape((N, 1))
    y = y.reshape((N, 1))
    data = np.hstack((x, y))
    
    #apply PCA
    obj = PCA()
    obj.fit(data)
    
    #slope of first principal component
    slope = obj.V[0, 1] / obj.V[0, 0]
    
    return slope
    
#     raise NotImplementedError
    

def lr_slope(X, y):
    """
    Calculates the slope of the best fit as given by Linear Regression
    
    For this function don't use any regularization

    Args: 
        X: N*1 array corresponding to a dataset
        y: N*1 array of labels y
    Return:
        slope: slope of the best fit
    """
    obj = Regression()
    slope = obj.linear_fit_closed(X, y)[0]
    
    return slope
    
#     raise NotImplementedError

 
We will consider a simple example with two variables, x and y, where the true relationship between the variables is y = 2x. Our goal is to recover this relationship, namely, to recover the coefficient "2". We set X = [0, .01, .02, .03, ..., 1] and y = 2x. Make sure both functions return 2.
In [34]:
# HELPER CELL, DO NOT MODIFY
x = np.arange(0, 1, 0.01)
y = 2 * np.arange(0, 1, 0.01)

print("Slope of first principal component", pca_slope(x, y))

print("Slope of best linear fit", lr_slope(x[:, None], y))

plt.scatter(x, y)
plt.xlabel("x")
plt.ylabel("y")
plt.show()

 
Slope of first principal component 2.0
Slope of best linear fit 2.0

 
 
 
5.2 Analysis Setup (5 Pts) **[W]**
Error in y
 
In this subpart, we consider the setting where our data consists of the actual values of $x$ and noisy estimates of $y$. Run the following cell to see how the data looks when there is error in $y$.
In [35]:
# HELPER CELL, DO NOT MODIFY
base = np.arange(0.001, 1, 0.001)
c = 0.5
X = base
y = 2 * base + np.random.normal(loc=[0], scale=c, size=base.shape)

plt.scatter(X, y)
plt.xlabel("x")
plt.ylabel("y")
plt.show()

 
 
 
In the subsequent cell implement the following:

Fix $X = [x_1, x_2, \dots, x_{1000}] = [.001, .002, .003, \dots, 1]$.
For a given noise level $c$, set $\hat{y}_i \sim 2x_i + N(0, c) = 2i/1000 + N(0, c)$, and $\hat{Y} = [\hat{y}_1, \hat{y}_2, \dots, \hat{y}_{1000}]$.
Make a scatter plot with $c$ on the horizontal axis and the outputs of pca_slope and lr_slope on the vertical axis.
For each $c$ in $[0, 0.05, 0.1, \dots, 0.95, 1.0]$, take a sample $\hat{Y}$, plot the output of pca_slope as a red dot and the output of lr_slope as a blue dot. Repeat 30 times. You should end up with a plot of 1260 dots, in 21 columns of 60, half red and half blue.
In [36]:
pca_slope_values = []
linreg_slope_values = []
c_values = []

for i in range(30):
    for c in np.arange(0, 1.01, 0.05):  # include c = 1.0 so there are 21 noise levels
        ####### YOUR CODE BELOW #######
        #step1
        X = base
        
        #step2
        y = 2 * base + np.random.normal(scale=c, size=base.shape)
        
        #step3
        slope1 = pca_slope(X, y)
        slope2 = lr_slope(X.reshape(-1, 1), y)
        pca_slope_values.append(slope1)
        linreg_slope_values.append(slope2) 
        ###############################

        c_values.append(c)

plt.scatter(c_values, pca_slope_values, c='r')
plt.scatter(c_values, linreg_slope_values, c='b')
plt.xlabel("c")
plt.ylabel("slope")
plt.show()

 
 
 
Error in x and y
 
We now examine the case where our data consists of noisy estimates of both $x$ and $y$. Run the following cell to see how the data looks when there is error in both.
In [37]:
# HELPER CELL, DO NOT MODIFY
base = np.arange(0.001, 1, 0.001)
c = 0.5
X = base + np.random.normal(loc=[0], scale=c, size=base.shape) * 0.5
y = 2 * base + np.random.normal(loc=[0], scale=c, size=base.shape) * 0.5

plt.scatter(X, y)
plt.xlabel("x")
plt.ylabel("y")
plt.show()

 
 
 
In the subsequent cell implement the following:

For a given noise level $c$, let $\hat{x}_i \sim x_i + N(0, c) = i/1000 + N(0, c)$, and $\hat{X} = [\hat{x}_1, \hat{x}_2, \dots, \hat{x}_{1000}]$.
For the same noise level $c$, set $\hat{y}_i \sim 2x_i + N(0, c) = 2i/1000 + N(0, c)$, and $\hat{Y} = [\hat{y}_1, \hat{y}_2, \dots, \hat{y}_{1000}]$.
Make a scatter plot with $c$ on the horizontal axis and the outputs of pca_slope and lr_slope on the vertical axis. For each $c$ in $[0, 0.05, 0.1, \dots, 0.95, 1.0]$, take a sample of both $\hat{X}$ and $\hat{Y}$, plot the output of pca_slope as a red dot and the output of lr_slope as a blue dot. Repeat 30 times. You should end up with a plot of 1260 dots, in 21 columns of 60, half red and half blue.
In [38]:
pca_slope_values = []
linreg_slope_values = []
c_values = []

for i in range(30):
    for c in np.arange(0, 1.01, 0.05):  # include c = 1.0 so there are 21 noise levels
        ####### YOUR CODE BELOW #######       
        #step2
        X = base + np.random.normal(scale=c, size=base.shape)
        y = 2 * base + np.random.normal(scale=c, size=base.shape)
        
        #step3
        slope1 = pca_slope(X, y)
        slope2 = lr_slope(X.reshape(-1, 1), y)
        pca_slope_values.append(slope1)
        linreg_slope_values.append(slope2)         
        ###############################

        c_values.append(c)

plt.scatter(c_values, pca_slope_values, c='r')
plt.scatter(c_values, linreg_slope_values, c='b')
plt.xlabel("c")
plt.ylabel("slope")
plt.show()

 
 
 
5.3. Analysis  **[W]**
Based on your observations from the previous subsections, answer the following questions about the two cases (error only in $Y$, and error in both $X$ and $Y$) in 2-3 lines.

Note:

The closer the recovered slope is to the actual slope (2 here), the better the algorithm is performing.
You don't need to provide a proof for this question.
Questions:

Which case does PCA perform worse in? Why does PCA perform worse in this case?
Why does PCA perform better in the other case?
In which case does Linear Regression perform well? Why does Linear Regression perform well in this case?
 
PCA performs worse when the error is only in Y. PCA minimizes the orthogonal (perpendicular) distance from each point to the line, which implicitly assumes noise in both coordinates; when all of the noise is in y, attributing part of it to x biases the recovered slope away from 2.
PCA performs better in the other case, when there is error in both x and y, because the noise is then roughly symmetric across the two coordinates, which matches the assumption behind minimizing orthogonal distance.
Linear regression performs well when the error is only in Y, because its loss penalizes error only in the dependent variable, which is exactly where the Gaussian noise lies in that case.
 
6 Manifold learning [Bonus for everyone][30 pts] **[W]**|**[P]**
While PCA is a wonderful tool for dimensionality reduction, it does not work very well when dealing with non-linear relationships between features. Manifold learning is a class of algorithms that can be used to reduce dimensions in complex high-dimensional datasets. While a number of methods have been proposed to perform this type of operation, we will focus on Isomap. Isomap is known to be sensitive to data noise, among other issues, but it performs reasonably well on real-world data. The algorithm consists of two main steps: first computing a manifold distance matrix, followed by classical multidimensional scaling. You will be creating your own implementation of Isomap. In order to do so, you must read the original paper "A Global Geometric Framework for Nonlinear Dimensionality Reduction" by Tenenbaum et al. (2000), which outlines the method. You are also encouraged to read the general survey of manifold learning by Cayton (2005), where the original algorithm is further explained in a more detailed yet simplified fashion.
 
6.1 Implementation  **[P]**
 
6.1.1 pairwise distance **[P]**
In this section, you are asked to implement pairwise_dist function.

Given $X \in \mathbb{R}^{N \times D}$ and $Y \in \mathbb{R}^{M \times D}$, obtain the pairwise distance matrix $\text{dist} \in \mathbb{R}^{N \times M}$ using the Euclidean distance metric, where $\text{dist}_{i,j} = \|X_i - Y_j\|_2$.

DO NOT USE FOR LOOP in your implementation -- they are slow and will make your code too slow to pass our grader. Use array broadcasting instead.
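One common way to vectorize this is the identity $\|x_i - y_j\|^2 = \|x_i\|^2 + \|y_j\|^2 - 2\,x_i \cdot y_j$, which broadcasting handles in a few lines (a sketch only, with my own function name, not the graded implementation):

```python
import numpy as np

def pairwise_dist_sketch(x, y):
    """N x M Euclidean distances between the rows of x (N x D) and the rows of y (M x D)."""
    # ||xi||^2 + ||yj||^2 - 2 xi.yj, computed with broadcasting (no Python loops)
    sq = (x ** 2).sum(axis=1)[:, None] + (y ** 2).sum(axis=1)[None, :] - 2 * x @ y.T
    return np.sqrt(np.maximum(sq, 0.0))  # clip tiny negatives caused by round-off
```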
 
6.1.2 manifold distance matrix [10pts] **[P]**
In this section, you need to implement manifold_distance_matrix function.

Given $X \in \mathbb{R}^{N \times D}$ and the number of nearest neighbours $K$, compute the distance matrix $\text{dist} \in \mathbb{R}^{N \times N}$, whose entries obey

$$\text{dist}_{ij} = \begin{cases} \|X_i - X_j\|_2, & j \in \text{Neighbourhood}(i) \\ \text{shortest path distance}, & j \notin \text{Neighbourhood}(i) \end{cases}$$
Hint: For doing this part, you can partly utilize the scipy toolbox. After creating your k-nearest weighted neighbors adjacency matrix, you can convert it to a sparse graph object csr_matrix and utilize the pre-built Floyd-Warshall algorithm to compute the manifold distance matrix.
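Following the hint, a possible sketch looks like the following (standalone, with my own function name; it builds a K-nearest-neighbour weighted adjacency matrix and then lets SciPy's Floyd-Warshall routine fill in the shortest-path distances):

```python
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import floyd_warshall

def manifold_distance_sketch(x, K):
    """N x N matrix of K-NN graph edge lengths, completed by shortest paths."""
    # Full Euclidean distance matrix (any correct pairwise distance would do here)
    sq = (x ** 2).sum(1)[:, None] + (x ** 2).sum(1)[None, :] - 2 * x @ x.T
    dist = np.sqrt(np.maximum(sq, 0.0))

    # Keep only each point's K nearest neighbours (excluding itself) as weighted edges
    adj = np.zeros_like(dist)
    nn = np.argsort(dist, axis=1)[:, 1:K + 1]
    rows = np.repeat(np.arange(x.shape[0]), K)
    adj[rows, nn.ravel()] = dist[rows, nn.ravel()]

    # Shortest-path completion over the sparse, symmetrized neighbourhood graph
    return floyd_warshall(csr_matrix(adj), directed=False)
```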
 
6.1.3 multidimensional scaling [10pts] **[P]**
In this section, you need to implement the multidimensional_scaling function.

Given the computed distance matrix distijdistij and the size of the new reduced feature space dd, you need to return the X embedding of the new feature space.

Closed Form of the Gram Matrix with Centering:

We can now succinctly state the closed matrix form of $B$ by making use of the centering matrix:

$$B = -\frac{1}{2} C_n D^2 C_n$$

Note: $C_n$ is the centering matrix, $C_n = I_n - \frac{1}{n}\mathbf{1}\mathbf{1}^T$, and $D^2$ is the matrix with entries $d_{ij}^2$ from the original distance matrix.

Find eigenvalues and eigenvectors of matrix B. Since the Gram matrix $B$ is a real, symmetric, positive semi-definite matrix, it has real eigenvalues, and we can use the following eigendecomposition of $B$ in order to find an expression for our output configuration:

$$B = V \Lambda V^T = (\Lambda^{1/2} V^T)^T (\Lambda^{1/2} V^T) = X X^T \quad \text{(from the original definition of } B\text{)}$$

and therefore we have

$$X = V \Lambda^{1/2}$$

where the eigenvalues are given by the diagonal matrix $\Lambda = \operatorname{diag}(\lambda_1, \dots, \lambda_n)$ and the eigenvectors form the columns of $V = (v_1, \dots, v_n)$.

Find coordinates of the output configuration. We can now define a k-dimensional configuration by choosing the largest $k$ eigenvalues and the corresponding $k$ columns of $V$:

$$X_k = V_k \Lambda_k^{1/2}$$

where $\Lambda_k$ is the $k \times k$ diagonal submatrix of $\Lambda$ and $V_k$ is the $n \times k$ submatrix of $V$.
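The derivation above translates almost line for line into NumPy (a minimal sketch with my own function name; the clipping of small negative eigenvalues is my own addition to guard against round-off):

```python
import numpy as np

def mds_sketch(dist_matrix, d):
    """Embed an N x N distance matrix into d dimensions via classical MDS."""
    n = dist_matrix.shape[0]
    C = np.eye(n) - np.ones((n, n)) / n       # centering matrix C_n
    B = -0.5 * C @ (dist_matrix ** 2) @ C     # Gram matrix B = -1/2 C_n D^2 C_n

    # Eigendecomposition of the symmetric matrix B, sorted by decreasing eigenvalue
    evals, evecs = np.linalg.eigh(B)
    order = np.argsort(evals)[::-1][:d]

    # X_d = V_d sqrt(Lambda_d); clip small negative eigenvalues from round-off
    return evecs[:, order] * np.sqrt(np.maximum(evals[order], 0.0))
```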
 
In the cell below, implement the code for section 6.1.
In [39]:
class Isomap(object):
    def __init__(self): # No need to implement
        pass
    
    def pairwise_dist(self, x, y): # [3 pts]
        """
        Args:
            x: N x D numpy array
            y: M x D numpy array
        Return:
                dist: N x M array, where dist2[i, j] is the euclidean distance between 
                x[i, :] and y[j, :]
                """
    
        raise NotImplementedError
    
    def manifold_distance_matrix(self, x, K): # 
        """
        Args:
            x: N x D numpy array
            K: number of nearest neighbours used to build the neighbourhood graph
        Return:
            dist_matrix: N x N numpy array, where dist_matrix[i, j] is the Euclidean distance between points i and j
            if j is in the neighborhood N(i), or the shortest path distance if j is not in the neighborhood N(i).
        Hint: After creating your k-nearest weighted neighbors adjacency matrix, you can convert it to a sparse graph
        object csr_matrix (https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html) and utilize
        the pre-built Floyd-Warshall algorithm (https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csgraph.floyd_warshall.html)
        to compute the manifold distance matrix.
        """

        raise NotImplementedError

    
    def multidimensional_scaling(self, dist_matrix, d): # 
        """
        Args:
            dist_matrix: N x N numpy array, the manifold distance matrix
            d: integer, size of the new reduced feature space 
        Return:
            S: N x d numpy array, X embedding into new feature space.
        """
        
        raise NotImplementedError

    # you do not need to change this
    def __call__(self, data, K, d):
        # get the manifold distance matrix
        W = self.manifold_distance_matrix(data, K)
        # compute the multidimensional scaling embedding
        emb_X = self.multidimensional_scaling(W, d)
        return emb_X

 
6.2 Examples for different datasets [7pts] **[W]**
Apply your implementation of Isomap for some of the datasets (e.g. MNIST and Iris). Discuss how the results compare to PCA.
In [40]:
# HELPER CELL, DO NOT MODIFY
# example MNIST data
mnist = load_digits()
proj = Isomap()(mnist.data, 10, 2)
plt.scatter(proj[:, 0], proj[:, 1], c=mnist.target, cmap=plt.cm.get_cmap('jet', 10))
plt.colorbar(ticks=range(10))

 
---------------------------------------------------------------------------
NotImplementedError                       Traceback (most recent call last)
<ipython-input-40-a9ee22e4c0ae> in <module>
      2 # example MNIST data
      3 mnist = load_digits()
----> 4 proj = Isomap()(mnist.data, 10, 2)
      5 plt.scatter(proj[:, 0], proj[:, 1], c=mnist.target, cmap=plt.cm.get_cmap('jet', 10))
      6 plt.colorbar(ticks=range(10))

<ipython-input-39-352e881a55a4> in __call__(self, data, K, d)
     45     def __call__(self, data, K, d):
     46         # get the manifold distance matrix
---> 47         W = self.manifold_distance_matrix(data, K)
     48         # compute the multidimensional scaling embedding
     49         emb_X = self.multidimensional_scaling(W, d)

<ipython-input-39-352e881a55a4> in manifold_distance_matrix(self, x, K)
     28         """
     29 
---> 30         raise NotImplementedError
     31 
     32 

NotImplementedError: 
 
7 Feature Selection Implementation [No Points] **[P]**
Note: This is a fun question for you to learn about Feature Reduction. No points will be awarded for it. If you have time please go over it. It would be beneficial for your project.

Implement a method to find the final list of significant features due to forward selection and backward elimination.

Forward Selection:
In forward selection, we start with a null model, fit the model with one individual feature at a time, and select the feature with the minimum p-value. We keep adding features in this way as long as the best new feature's p-value is below the significance level, and stop once no remaining feature meets that threshold. (A sketch using statsmodels is given after the code skeleton below.)

Steps to implement it:

1: Choose a significance level (given to you).
2: Fit all possible simple regression models by considering one feature at a time.
3: Select the feature with the lowest p-value.
4: Fit all possible models with one extra feature added to the previously selected feature(s).
5: Select the feature with the minimum p-value again. If p_value < significance level, keep it and go to Step 4. Otherwise, terminate.
Backward Elimination:
In backward elimination, we start with a full model, and then remove the insignificant feature with the highest p-value (that is greater than the significance level). We continue to do this until we have a final set of significant features.

Steps to implement it:

1: Choose a significance level (given to you).
2: Fit a full model including all the features.
3: Select the feature with the highest p-value. If (p_value > significance level), go to Step 4, otherwise terminate.
4: Remove the feature under consideration.
5: Fit a model without this feature. Repeat entire process from Step 3 onwards.
TIP 1: The p-value is known as the observed significance value for a test hypothesis. It tests all the assumptions about how the data was generated in the model, not just the target hypothesis it was supposed to test. Some more information about p-values can be found here: https://towardsdatascience.com/what-is-a-p-value-b9e6c207247f

TIP 2: For this function, you will have to install statsmodels if not installed already. Run 'pip install statsmodels' in command line/terminal. statsmodels is a Python module that provides classes and functions for the estimation of many different statistical models, as well as for conducting statistical tests, and statistical data exploration. You will have to use this library to choose a regression model to fit your data against. Some more information about this module can be found here: https://www.statsmodels.org/stable/index.html
In [ ]:
import pandas as pd
import statsmodels.api as sm

class FeatureReduction(object):
    def __init__(self):
        pass

    @staticmethod
    def forward_selection(data, target, significance_level=0.05):
        '''
        Args:
            data: data frame that contains the feature matrix
            target: target feature to search to generate significant features
            significance_level: the probability of the event occuring by chance
        Return:
            forward_list: list containing significant features
        '''
        
        raise NotImplementedError

    @staticmethod
    def backward_elimination(data, target, significance_level = 0.05):
        '''
        Args:
            data: data frame that contains the feature matrix
            target: target feature to search to generate significant features
            significance_level: the probability of the event occuring by chance
        Return:
            backward_list: list containing significant features
        '''
        
        raise NotImplementedError
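If you want a starting point, the sketch below is one possible reading of the steps above using statsmodels OLS p-values (standalone functions with my own names, not the graded FeatureReduction API; the stopping rules are my interpretation of the steps):

```python
import pandas as pd
import statsmodels.api as sm

def forward_selection_sketch(data, target, significance_level=0.05):
    """Greedy forward selection on OLS p-values."""
    selected, remaining = [], list(data.columns)
    while remaining:
        # p-value of each candidate feature when added to the current model
        pvals = pd.Series(index=remaining, dtype=float)
        for col in remaining:
            X = sm.add_constant(data[selected + [col]])
            pvals[col] = sm.OLS(target, X).fit().pvalues[col]
        best = pvals.idxmin()
        if pvals[best] >= significance_level:
            break                      # no remaining feature is significant
        selected.append(best)
        remaining.remove(best)
    return selected

def backward_elimination_sketch(data, target, significance_level=0.05):
    """Drop the least significant feature until all remaining p-values pass the level."""
    selected = list(data.columns)
    while selected:
        model = sm.OLS(target, sm.add_constant(data[selected])).fit()
        pvals = model.pvalues.drop('const', errors='ignore')
        worst = pvals.idxmax()
        if pvals[worst] <= significance_level:
            break                      # every remaining feature is significant
        selected.remove(worst)
    return selected
```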

In [ ]:
# HELPER CELL, DO NOT MODIFY
boston = load_boston()
bos = pd.DataFrame(boston.data, columns = boston.feature_names)
bos['Price'] = boston.target
X = bos.drop("Price", 1)       # feature matrix 
y = bos['Price']               # target feature
featurereduction = FeatureReduction()
#Run the functions to make sure two lists are generated, one for each method
print("Features selected by forward selection:", featurereduction.forward_selection(X, y))
print("Features selected by backward selection:", featurereduction.backward_elimination(X, y))
