机器学习(一)

2018/03/20 机器学习 阅读次数:

摘要:机器学习入门,数据类型

数据类型: 离散型数据类型

    - 主要做分类的,不能划分的

连续型数据类型

    - 主要做回归的,做连续预测,可无限划分

可用数据集

scikit-learn

结构:特征值+目标值,对应监督学习

from sklearn import datasets
from sklearn.model_selection import train_test_split
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import  MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer


def load_data():

    iris = datasets.load_iris()
    print("特征数据:",iris.data)
    print("标签数据:",iris.target)
    print("描述信息:",iris.DESCR)
    print("特征名称",iris.feature_names)
    print("标签名称",iris.tarfet_names)


def load_new():

    news = datasets.fetch_20newsgroups(subset="all")
    print(news.data)
    print(news.target)

    # 返回值4个参数
    # 1.X_train:训练的特征数据
    # 2.X_test : 测试的特征数据
    # 3. y_train: 训练的目标数据
    # 4. y_test:训练测试数据
    #                                                   x          y,
    X_train,X_test,y_train,y_test = train_test_split(news.data,news.target,test_size=0.25)
    print(X_train)


def knn():

    # data = pd.read_csv("train.csv")
    # data = data[:1000000]
    # data.to_csv("train_demo.csv",index=False)

    # 1.读取数据
    data = pd.read_csv("train_demo.csv")
    # 2.将数据中的特征和目标取出
    y = data["place_id"]
    X = data.drop(["place_id"],axis=1)
    print(X)

    # 时间戳处理
    # 默认unit:ns
    time_value =pd.to_datetime(data["time"],unit="s")
    # 2.数据划分
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
    # 3.特征工程处理(数据标准化处理)
    ss = StandardScaler()
    X_train = ss.fit_transform(X_train)
    X_test = ss.transform(X_test)
    # 4.选择估计器,训练模型
    Kn = KNeighborsClassifier(n_neighbors=5)
    # 5.训练产生模型
    Kn.fit(X_train,y_train)
    # 6.预测模型
    y_predict = Kn.predict(X_test)
    # 7.评测模型
    socre = Kn.score(X_test,y_test)
    print("预测的结果",y_predict)
    print("准确率",socre)

def navie_bayes():

    """
    朴素贝叶斯实现20组新闻数据分裂
    :return:
    """
    # 1. 获取数据
    news = datasets.fetch_20newsgroups(subset="all")
    print(news)
    # 2.划分数据
    X_train, X_test, y_train, y_test = train_test_split(news.data,news.target,test_size=0.25)

    # 3. 处理特征工程
    tfidf = TfidfVectorizer()
    X_train = tfidf.fit_transform(X_train)
    X_test = tfidf.transform(X_test)

    # 4. 选择估计器,训练模型
    mn = MultinomialNB(alpha=1.0)
    mn.fit(X_train,y_train)

    y_predict = mn.predict(X_test)
    # 5.评测模型
    socre = mn.score(X_test,y_test)
    print("预测的结果",y_predict)
    print("准确率",socre)


if __name__ == '__main__':
    # load_data()
    # load_new()
    # knn()
    navie_bayes()

Search

    Table of Contents