1. 学习目的

使用 Logistic Regression（逻辑回归）进行数据分类。

2. 学习要求

学习LR学习算法的核心代码
写出详细的注释说明

3. 代码实践

（1）创建数据

def generate_data(seed):
    """Build a shuffled two-class, two-feature toy data set.

    Class 0 has 300 samples whose features are drawn from normal
    distributions with means 5.0 and 4.0 and standard deviation 1.0.
    Class 1 has 400 samples with the same means and standard deviation 2.0.

    :param seed: value passed to ``np.random.seed`` so the result is reproducible
    :return: ``(x, y)`` where ``x`` has shape (700, 2) and ``y`` holds the
        matching 0/1 labels, both shuffled with the same permutation
    """
    np.random.seed(seed)

    n_class0, n_class1 = 300, 400

    # The draw order (class 0 feature 1, class 0 feature 2, class 1 feature 1,
    # class 1 feature 2) fixes which random numbers each array receives.
    class0_f1 = np.random.normal(loc=5.0, scale=1.0, size=n_class0)
    class0_f2 = np.random.normal(loc=4.0, scale=1.0, size=n_class0)
    class1_f1 = np.random.normal(loc=5.0, scale=2.0, size=n_class1)
    class1_f2 = np.random.normal(loc=4.0, scale=2.0, size=n_class1)

    # One column per feature, class 0 rows first.
    feature1 = np.concatenate((class0_f1, class1_f1), axis=0)
    feature2 = np.concatenate((class0_f2, class1_f2), axis=0)
    x = np.column_stack((feature1, feature2))
    y = np.concatenate(([0] * n_class0, [1] * n_class1), axis=0)

    # Shuffle samples and labels with one shared permutation.
    order = np.random.permutation(n_class0 + n_class1)
    return x[order], y[order]

（2）分割数据

其中80%数据用于训练，20%数据用于测试，由于数据量小，不设置验证数据集

# Train/test split; the data set is small, so no validation set is carved out.
def data_split(x_data, y_data, train_ratio=0.8):
    """Split samples and labels into a leading training part and a trailing test part.

    No shuffling is done here; the split is a plain slice at
    ``int(len(y_data) * train_ratio)``.

    :param x_data: sliceable sequence of samples
    :param y_data: sliceable sequence of labels, aligned with ``x_data``
    :param train_ratio: fraction of the data used for training (default 0.8)
    :return: ``(x_train, y_train, x_test, y_test)``
    """
    train_split = int(len(y_data) * train_ratio)
    # Slice the arguments themselves rather than same-named outer variables.
    x_train = x_data[:train_split]
    y_train = y_data[:train_split]
    x_test = x_data[train_split:]
    y_test = y_data[train_split:]

    return x_train, y_train, x_test, y_test

（3）构建模型

根据上文：李宏毅机器学习（四），以及大佬：王佳旭同学代码。

# Logistic regression model
class LogisticRegression():
    """Binary logistic-regression classifier trained with batch gradient descent.

    :param lr: learning rate used for every parameter update
    :param num_iters: number of gradient-descent updates performed by ``fit``
    :param seed: value passed to ``np.random.seed`` before the parameters are
        randomly initialised
    """

    def __init__(self, lr=0.1, num_iters=100, seed=None):
        self.seed = seed
        self.lr = lr
        self.num_iters = num_iters

    def fit(self, x, y):
        """Initialise ``w`` and ``b`` randomly and run ``num_iters`` updates.

        :param x: training samples, one row per sample
        :param y: 0/1 training labels aligned with the rows of ``x``
        """
        np.random.seed(self.seed)
        # Keep the training set on the instance; the other methods fall back
        # to it when called without arguments.
        self.x = np.asarray(x)
        self.y = np.asarray(y)
        # Random initialisation of the weights (one per feature) and the bias.
        self.w = np.random.normal(loc=0.0, scale=1.0, size=self.x.shape[1])
        self.b = np.random.normal(loc=0.0, scale=1.0)
        for _ in range(self.num_iters):
            self.gradient_descent()

    def _sigmoid(self, z):
        """Return the logistic sigmoid of ``z``."""
        return 1.0 / (1.0 + np.exp(-z))

    def _f(self, x, w, b):
        """Model function: sigmoid of the linear score ``x.dot(w) + b``."""
        z = x.dot(w) + b
        return self._sigmoid(z)

    def predict_proba(self, x=None):
        """Return the predicted probability of class 1.

        :param x: samples to score; the stored training samples when ``None``
        """
        if x is None:
            x = self.x
        return self._f(x, self.w, self.b)

    def predict(self, x=None):
        """Return hard 0/1 labels: 0 where the probability is below 0.5, else 1.

        :param x: samples to classify; the stored training samples when ``None``
        """
        y_pred_proba = self.predict_proba(x)
        return np.where(y_pred_proba < 0.5, 0, 1)

    def score(self, y_true=None, y_pred=None):
        """Return the accuracy of ``y_pred`` against ``y_true``.

        If either argument is ``None``, both are replaced by the stored
        training labels and the predictions on the stored training samples.
        """
        if y_true is None or y_pred is None:
            y_true = self.y
            y_pred = self.predict()
        return np.mean(np.asarray(y_true) == np.asarray(y_pred))

    def loss(self, y_true=None, y_pred_proba=None):
        """Return the mean binary cross-entropy.

        If either argument is ``None``, both are replaced by the stored
        training labels and the probabilities on the stored training samples.
        """
        if y_true is None or y_pred_proba is None:
            y_true = self.y
            y_pred_proba = self.predict_proba()
        y_true = np.asarray(y_true)
        # Keep probabilities strictly inside (0, 1) so the logarithms stay finite.
        eps = 1e-12
        proba = np.clip(y_pred_proba, eps, 1.0 - eps)
        return np.mean(-1.0 * (y_true * np.log(proba) + (1.0 - y_true) * np.log(1.0 - proba)))

    def gradient_descent(self):
        """Perform one gradient-descent update of ``w`` and ``b``.

        The cross-entropy gradient is built from the predicted probabilities
        (not the thresholded labels), so it matches ``loss``.

        :return: the updated ``(w, b)``
        """
        residual = self.predict_proba() - self.y
        d_w = residual.dot(self.x) / len(self.y)
        d_b = np.mean(residual)
        self.w = self.w - self.lr * d_w
        self.b = self.b - self.lr * d_b
        return self.w, self.b

（4）训练生成结果

import matplotlib.pyplot as plt
import numpy as np

def main():
    """Generate the toy data, train the classifier, plot it and print test metrics.

    Prints the test-set accuracy followed by the test-set cross-entropy loss.
    """
    # Generate and split the data.
    x, y = generate_data(seed=514)
    x_train, y_train, x_test, y_test = data_split(x, y)

    # Min-max normalisation. The test set is scaled with the TRAINING minimum
    # and range so both sets go through the same feature mapping.
    feature_min = np.min(x_train, axis=0)
    feature_range = np.max(x_train, axis=0) - feature_min
    x_train = (x_train - feature_min) / feature_range
    x_test = (x_test - feature_min) / feature_range

    # Logistic-regression classifier.
    clf = LogisticRegression(lr=0.1, num_iters=500, seed=514)
    clf.fit(x_train, y_train)

    # Visualisation: training points coloured by label, plus the line where
    # w[0] * x1 + w[1] * x2 + b == 0 (predicted probability 0.5).
    def split_boundary_func(x1):
        return (-clf.b - clf.w[0] * x1) / clf.w[1]

    xx = np.arange(0.1, 0.6, 0.1)
    cValue = ['g', 'b']
    plt.scatter(x_train[:, 0], x_train[:, 1], c=[cValue[i] for i in y_train], marker='o')
    plt.plot(xx, split_boundary_func(xx), c='red')
    plt.show()

    # Accuracy and loss on the test set.
    y_test_pred = clf.predict(x_test)
    y_test_pred_proba = clf.predict_proba(x_test)
    print(clf.score(y_test, y_test_pred))
    print(clf.loss(y_test, y_test_pred_proba))


if __name__ == '__main__':
    main()

本人在代码方面还是有所欠缺，对numpy、matplotlib的使用不熟悉。感谢王同学提供的代码。

李宏毅机器学习-代码实践：1. 学习目的；2. 学习要求；3. 代码实践

1. 学习目的

2. 学习要求

3. 代码实践

（1）创建数据

（2）分割数据

（3）构建模型

（4）训练生成结果

继续阅读

简单文档分类——朴素贝叶斯算法朴素贝叶斯算法简单文档分类实例步骤总结朴素贝叶斯分类调用(sklearn)

【分类算法】什么是分类算法定义分类与聚类分类过程方法

分类算法的评价指标

K-近邻算法以及图像分类应用

weka之NB算法

使用weka的select attribute

weka中分类器算法

在weka中集成自己的算法

【多变量线性回归】学习记录序思路实现终

申请评分模型拒绝推断（RI）方法申请评分模型拒绝推断（RI）方法

【人工智能行业大师访谈1】吴恩达采访 Geoffery Hinton

【趋高机器视觉】机器视觉技术原理解析及解决方案

吴恩达 coursera ML 第七课总结+作业答案前言目录正文模型表示作业答案

XGBoost Plotting API以及GBDT组合特征实践 XGBoost Plotting API以及GBDT组合特征实践

解码器用于语义分割：数据依赖的解码可以实现灵活的特征聚合

2021-2025年中国运动疗法（KT）带行业市场供需与战略研究报告