Learning notes of linear regression and logistic regression

Posted by andrewgarn on Wed, 15 Dec 2021 22:24:05 +0100

1, Linear regression

source code

import numpy as np
import matplotlib.pyplot as plt 

def true_fun(X): # The true function we define, i.e. the ground-truth model
    return 1.5*X + 0.2

np.random.seed(0) # Set random seed
n_samples = 30 # Sets the number of sampling data points

'''Generate random data as training set, and add some noise'''
X_train = np.sort(np.random.rand(n_samples)) 
y_train = (true_fun(X_train) + np.random.randn(n_samples) * 0.05).reshape(n_samples,1)

#Define model
from sklearn.linear_model import LinearRegression # Import linear regression model
model = LinearRegression() # Define model
model.fit(X_train[:,np.newaxis], y_train) # Training model
print("Output parameters w: ",model.coef_) # Output model parameter w
print("Output parameters b: ",model.intercept_) # Output parameter b
#Model test and comparison
X_test = np.linspace(0, 1, 100)
plt.plot(X_test, model.predict(X_test[:, np.newaxis]), label="Model")
plt.plot(X_test, true_fun(X_test), label="True function")
plt.scatter(X_train,y_train) # Draw the points of the training set
plt.legend(loc="best")
plt.show()

np.random.seed()

import numpy as np
def abc():
    for i in range(5):
        np.random.seed(1)
        print(np.random.rand(2))
abc()

[0.417022   0.72032449]
[0.417022   0.72032449]
[0.417022   0.72032449]
[0.417022   0.72032449]
[0.417022   0.72032449]

import numpy as np
def abc_n():
    np.random.seed(1)
    for i in range(5):
        print(np.random.rand(2))
abc_n()
[0.417022   0.72032449]
[1.14374817e-04 3.02332573e-01]
[0.14675589 0.09233859]
[0.18626021 0.34556073]
[0.39676747 0.53881673]

Calling np.random.seed() with the same seed before each draw yields the same random numbers.
In abc_n the seed is set only once, before the loop, so from the second iteration onward the numbers no longer restart from the seed and are therefore different.

np.sort

sort(a, axis=-1, kind=None, order=None):
a is the array to be sorted
axis: the axis to sort along; the default is -1 (the last axis), which for a 2-D array sorts the elements within each row.

import numpy as np
a=np.array(
[
    [1,2,3],
    [4,5,6],
    [9,8,7]])
a=np.sort(a)
print(a)
[[1 2 3]
 [4 5 6]
 [7 8 9]]

Set axis=0 to sort by column

import numpy as np
a=np.array(
[
   [2,1,3],
   [4,5,6],
   [0,8,7]])
a=np.sort(a,axis=0)
print(a)
[[0 1 3]
 [2 5 6]
 [4 8 7]]

## model.fit()
```python
def fit(self, X, y, sample_weight=None):
       """
       Fit linear model.

       Parameters
       ----------
       X : {array-like, sparse matrix} of shape (n_samples, n_features)
           Training data

       y : array-like of shape (n_samples,) or (n_samples, n_targets)
           Target values. Will be cast to X's dtype if necessary

       sample_weight : array-like of shape (n_samples,), default=None
            Individual weights for each sample
            ...
        """
```

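As a quick illustration of fit() with the optional sample_weight argument, here is a minimal sketch on made-up data:

```python
import numpy as np
from sklearn.linear_model import LinearRegression

# Made-up data: X has shape (n_samples, n_features), y has shape (n_samples,)
X = np.array([[0.0], [1.0], [2.0], [3.0]])
y = np.array([0.2, 1.7, 3.2, 4.7])
weights = np.array([1.0, 1.0, 2.0, 2.0])  # give the last two samples twice the weight

model = LinearRegression()
model.fit(X, y, sample_weight=weights)  # sample_weight is optional and defaults to None
print(model.coef_, model.intercept_)
```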
plt.scatter()

def scatter(
        x, y, s=None, c=None, marker=None, cmap=None, norm=None,
        vmin=None, vmax=None, alpha=None, linewidths=None,
        verts=cbook.deprecation._deprecated_parameter,
        edgecolors=None, *, plotnonfinite=False, data=None, **kwargs):

The scatter function draws a scatter plot of x against y.
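A minimal sketch of a scatter plot (the data here is made up for illustration):

```python
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(0)
x = np.random.rand(50)
y = 1.5 * x + 0.2 + np.random.randn(50) * 0.05

# s controls the marker size, c the color, alpha the transparency
plt.scatter(x, y, s=20, c='blue', alpha=0.7, edgecolors='k')
plt.xlabel("x")
plt.ylabel("y")
plt.show()
```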

2, Polynomial regression

source code

import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline # Chain different processing steps together
from sklearn.preprocessing import PolynomialFeatures # Class that generates polynomial features
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

def true_fun(X): # The true function we define, i.e. the ground-truth model
    return np.cos(1.5 * np.pi * X) # np.pi is the constant pi
np.random.seed(0) # Set random seed
n_samples = 30 # Set the number of sampling data points

X = np.sort(np.random.rand(n_samples)) 
y = true_fun(X) + np.random.randn(n_samples) * 0.1

degrees = [1, 4, 15] # Polynomial highest degree
plt.figure(figsize=(14, 5))
for i in range(len(degrees)):
    ax = plt.subplot(1, len(degrees), i + 1)
    plt.setp(ax, xticks=(), yticks=()) # Set axes properties; xticks=() removes the x-axis ticks
    polynomial_features = PolynomialFeatures(degree=degrees[i],
                                             include_bias=False)
    linear_regression = LinearRegression()
    pipeline = Pipeline([("polynomial_features", polynomial_features),
                         ("linear_regression", linear_regression)]) # Use a Pipeline to chain the models together
    pipeline.fit(X[:, np.newaxis], y)
    
    scores = cross_val_score(pipeline, X[:, np.newaxis], y, scoring="neg_mean_squared_error", cv=10) # Use 10-fold cross validation
    X_test = np.linspace(0, 1, 100)
    plt.plot(X_test, pipeline.predict(X_test[:, np.newaxis]), label="Model")
    plt.plot(X_test, true_fun(X_test), label="True function")
    plt.scatter(X, y, edgecolor='b', s=20, label="Samples")
    plt.xlabel("x")
    plt.ylabel("y")
    plt.xlim((0, 1)) # Limit the displayed x range
    plt.ylim((-2, 2))
    plt.legend(loc="best") # plt.legend() adds a legend to the figure
    plt.title("Degree {}\nMSE = {:.2e}(+/- {:.2e})".format(
        degrees[i], -scores.mean(), scores.std()))
plt.show()

np.random.randn(d0, d1, ..., dn)

1) With no arguments, it returns a single float;
2) With one argument, it returns a rank-1 array, which cannot unambiguously represent a vector or matrix;
3) With two or more arguments, it returns an array of the corresponding shape, which can represent a vector or matrix;
4) np.random.standard_normal() is similar to np.random.randn(), but np.random.standard_normal() takes its shape argument as a tuple;
5) The arguments of np.random.randn() are usually integers; a floating-point argument is automatically truncated to an integer.
These behaviors are illustrated in the sketch below.
--------
Copyright notice: the summary above is from the original article by CSDN blogger "signal excavator", published under the CC 4.0 BY-SA license. Please include the original source link and this notice when reprinting.
Original link: https://blog.csdn.net/qq_40130759/article/details/79535575
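A minimal sketch illustrating the behaviors above (the seed and shapes are chosen arbitrarily):

```python
import numpy as np

np.random.seed(0)

print(np.random.randn())        # no arguments: a single float
print(np.random.randn(3))       # one argument: a rank-1 array of shape (3,)
print(np.random.randn(2, 3))    # two or more arguments: an array of shape (2, 3)

# np.random.standard_normal takes the shape as a tuple instead of separate arguments
print(np.random.standard_normal((2, 3)))
```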

plt.subplot()

plt.subplot(nrows, ncols, index, **kwargs)
nrows: number of rows in the subplot grid
ncols: number of columns in the subplot grid
index: the position of the subplot in the grid (1-based)
projection: {None, 'aitoff', 'hammer', 'lambert', 'mollweide', 'polar', 'rectilinear', str}, optional
The projection type of the subplot (Axes). str is the name of a custom projection, see projections. The default None results in a 'rectilinear' projection.
In other words, projection selects the kind of subplot, e.g. 'polar' for a polar plot; the default None gives an ordinary rectilinear (line) plot.

#For example:
import matplotlib.pyplot as plt
plt.subplot(3,2,1) # Split the figure into a 3x2 grid and select subplot number 1
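A slightly fuller sketch combining the grid and the projection argument (the data is made up for illustration):

```python
import numpy as np
import matplotlib.pyplot as plt

theta = np.linspace(0, 2 * np.pi, 100)

plt.subplot(1, 2, 1)                      # 1x2 grid, first subplot (rectilinear by default)
plt.plot(theta, np.sin(theta))

plt.subplot(1, 2, 2, projection='polar')  # second subplot drawn as a polar plot
plt.plot(theta, np.abs(np.sin(theta)))

plt.show()
```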

plt.setp() sets one or more properties on an artist; it can also be used to query the allowed values of a property.

With setp() you can create a plot first and then modify its properties afterwards:

line, = plt.plot([1, 2, 3])
plt.setp(line, linestyle='--') # Change the line to a dashed line

xticks() function

Sets the x-axis tick locations and labels.
Signature:

xticks(ticks, [labels], **kwargs)
ticks: array-like, the positions at which to place the x-axis ticks
[labels]: array-like, the label to display at each tick position
**kwargs: text properties such as the labels' rotation and color.
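A minimal sketch (the tick positions and labels here are made up):

```python
import numpy as np
import matplotlib.pyplot as plt

plt.plot(np.arange(5), np.arange(5) ** 2)
# Place ticks at 0..4, label them A..E, and rotate the labels by 45 degrees
plt.xticks(np.arange(5), ['A', 'B', 'C', 'D', 'E'], rotation=45, color='red')
plt.show()
```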

PolynomialFeatures()

sklearn.preprocessing.PolynomialFeatures(degree=2, *, interaction_only=False, include_bias=True, order='C')

Generates polynomial and interaction features.
If an input sample is two-dimensional of the form [a, b], its degree-2 polynomial features are [1, a, b, a^2, ab, b^2].
degree: controls the degree of the polynomial.

interaction_only: False by default. If True, features that are powers of a single input (a^2 and b^2 above) are excluded, keeping only interaction terms.

include_bias: True by default. If True, the bias column of ones (the 1 above) is included.
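A minimal sketch of what these options produce (the sample [2, 3] is made up):

```python
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

X = np.array([[2, 3]])  # one sample with features a=2, b=3

poly = PolynomialFeatures(degree=2)
print(poly.fit_transform(X))
# [[1. 2. 3. 4. 6. 9.]]  -> [1, a, b, a^2, ab, b^2]

poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
print(poly.fit_transform(X))
# [[2. 3. 6.]]  -> [a, b, ab]
```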

cross_val_score

cross_val_score(estimator, X, y=None, *,
    groups=None,
    scoring=None,             # scoring metric
    cv=None,                  # number of folds (or a cross-validation splitter)
    n_jobs=None,              # number of jobs run in parallel; the fits over the cross-validation splits are parallelized. None means 1 unless in a joblib.parallel_backend context; -1 means use all processors.
    verbose=0,                # verbosity level
    fit_params=None,
    pre_dispatch='2*n_jobs',
    error_score=nan)

With cv=10 the data set is split into 10 folds; each fold in turn serves as the test set while the remaining folds form the training set.
Advantages of cross validation:
1: Cross validation evaluates the predictive performance of a model, in particular how the trained model performs on new data, which helps reduce overfitting to some extent.
2: It also extracts as much useful information as possible from limited data.
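A minimal usage sketch (the synthetic data below is made up for illustration):

```python
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

rng = np.random.RandomState(0)
X = rng.rand(50, 1)
y = 1.5 * X.ravel() + 0.2 + rng.randn(50) * 0.05

# 10-fold cross validation; scores are negative MSE, so negate them to get MSE
scores = cross_val_score(LinearRegression(), X, y, scoring="neg_mean_squared_error", cv=10)
print(-scores.mean(), scores.std())
```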

plt.legend()

Here is a brief summary of some commonly used options:
1. Set the legend position: plt.legend(loc='xxx')

location string location code
'best' 0
'upper right' 1
'upper left' 2
'lower left' 3
'lower right' 4
'right' 5
'center left' 6
'center right' 7
'lower center' 8
'upper center' 9
'center' 10
2. Set legend font size
fontsize : int or float or {'xx-small', 'x-small', 'small', 'medium', 'large', 'x-large', 'xx-large'}
The font size of the legend. If the value is numeric, it is the absolute font size in points. String values are sized relative to the current default font size. This parameter is used only if prop is not specified.

3. Set legend border and background
plt.legend(loc = 'best', frameon=False) # remove the legend border
plt.legend(loc = 'best', edgecolor = 'blue') # sets the legend border color
plt.legend(loc = 'best', facecolor = 'blue') # sets the background color of the legend. If there is no border, the parameter is invalid

4. Set the legend title: plt.legend(title='xxx')
For example, plt.legend(title='sinx and cosx') adds a title above the legend entries. These options are combined in the sketch below.
--------
Copyright notice: the summary above is from the original article by CSDN blogger "humingzhu_97", published under the CC 4.0 BY-SA license. Please include the original source link and this notice when reprinting.
Original link: https://blog.csdn.net/humingzhu_97/article/details/104899572
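A minimal sketch combining the legend options above (the curves and labels are made up for illustration):

```python
import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(0, 2 * np.pi, 100)
plt.plot(x, np.sin(x), label='sinx')
plt.plot(x, np.cos(x), label='cosx')

# Position, font size, border color and title of the legend
plt.legend(loc='upper right', fontsize='small', edgecolor='blue', title='trig functions')
plt.show()
```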

3, Logistic regression

source code

# Add a directory to the system path to facilitate the import of modules. The root directory of the project is "... / machine learning toy code"
import sys
from pathlib import Path # Path handling
curr_path = str(Path().absolute())
parent_path = str(Path().absolute().parent)
p_parent_path = str(Path().absolute().parent.parent)
sys.path.append(p_parent_path) 
print(f"The home directory is:{p_parent_path}")


from torch.utils.data import DataLoader
from torchvision import datasets
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import numpy as np

train_dataset = datasets.MNIST(root = p_parent_path+'/datasets/', train = True,transform = transforms.ToTensor(), download = False)
test_dataset = datasets.MNIST(root = p_parent_path+'/datasets/', train = False, transform = transforms.ToTensor(), download = False)

batch_size = len(train_dataset) # Load each dataset in a single batch so it can be converted to numpy arrays below
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=len(test_dataset), shuffle=True)
X_train, y_train = next(iter(train_loader))
X_test, y_test = next(iter(test_loader))
# Display the first 100 images
images, labels = X_train[:100], y_train[:100]
# Arrange the images in a grid that is 10 images wide
img = torchvision.utils.make_grid(images, nrow=10)
# plt.imshow() expects an array of shape (height, width, channels), while make_grid returns (channels, height, width),
# so transpose() is used to move the color channel to the last dimension
img = img.numpy().transpose(1,2,0)
print(images.shape)
print(labels.reshape(10,10))
print(img.shape)
plt.imshow(img)
plt.show()

X_train, y_train = X_train.cpu().numpy(), y_train.cpu().numpy() # tensor to array
X_test, y_test = X_test.cpu().numpy(), y_test.cpu().numpy() # tensor to array

X_train = X_train.reshape(X_train.shape[0],784)
X_test = X_test.reshape(X_test.shape[0],784)

# solver: the optimizer to use; lbfgs is a quasi-Newton method, sag is stochastic average gradient descent
model = LogisticRegression(solver='lbfgs', max_iter=400) # lbfgs: quasi-Newton method
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred)) # Print report

ones_col=[[1] for i in range(len(X_train))] # Generate a two-dimensional nested list of ones, i.e. [[1], [1], ..., [1]]
X_train = np.append(X_train,ones_col,axis=1)
x_train = np.mat(X_train)
X_test = np.append(X_test,ones_col,axis=1)
x_test = np.mat(X_test)
# MNIST has ten labels, 0-9. Since this is a binary classification task, take the digit 1 as the positive class (label 1) and all other digits as label 0, i.e. recognize whether an image is a 1
y_train=np.array([1 if y_train[i]==1 else 0 for i in range(len(y_train))])
y_test=np.array([1 if y_test[i]==1 else 0 for i in range(len(y_test))])

# solver: the optimizer to use; lbfgs is a quasi-Newton method, sag is stochastic average gradient descent
model = LogisticRegression(solver='lbfgs', max_iter=100) # lbfgs: quasi-Newton method
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred)) # Print report

Topics: Machine Learning, Logistic Regression