摘要:机器学习入门,数据类型
数据类型: 离散型数据类型
- 主要做分类的,不能划分的
连续型数据类型
- 主要做回归的,做连续预测,可无限划分
可用数据集
scikit-learn
结构:特征值+目标值,对应监督学习
from sklearn import datasets
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
def load_data():
iris = datasets.load_iris()
print("特征数据:",iris.data)
print("标签数据:",iris.target)
print("描述信息:",iris.DESCR)
print("特征名称",iris.feature_names)
print("标签名称",iris.tarfet_names)
def load_new():
news = datasets.fetch_20newsgroups(subset="all")
print(news.data)
print(news.target)
# 返回值4个参数
# 1.X_train:训练的特征数据
# 2.X_test : 测试的特征数据
# 3. y_train: 训练的目标数据
# 4. y_test:训练测试数据
# x y,
X_train,X_test,y_train,y_test = train_test_split(news.data,news.target,test_size=0.25)
print(X_train)
def knn():
# data = pd.read_csv("train.csv")
# data = data[:1000000]
# data.to_csv("train_demo.csv",index=False)
# 1.读取数据
data = pd.read_csv("train_demo.csv")
# 2.将数据中的特征和目标取出
y = data["place_id"]
X = data.drop(["place_id"],axis=1)
print(X)
# 时间戳处理
# 默认unit:ns
time_value =pd.to_datetime(data["time"],unit="s")
# 2.数据划分
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
# 3.特征工程处理(数据标准化处理)
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)
# 4.选择估计器,训练模型
Kn = KNeighborsClassifier(n_neighbors=5)
# 5.训练产生模型
Kn.fit(X_train,y_train)
# 6.预测模型
y_predict = Kn.predict(X_test)
# 7.评测模型
socre = Kn.score(X_test,y_test)
print("预测的结果",y_predict)
print("准确率",socre)
def navie_bayes():
"""
朴素贝叶斯实现20组新闻数据分裂
:return:
"""
# 1. 获取数据
news = datasets.fetch_20newsgroups(subset="all")
print(news)
# 2.划分数据
X_train, X_test, y_train, y_test = train_test_split(news.data,news.target,test_size=0.25)
# 3. 处理特征工程
tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)
# 4. 选择估计器,训练模型
mn = MultinomialNB(alpha=1.0)
mn.fit(X_train,y_train)
y_predict = mn.predict(X_test)
# 5.评测模型
socre = mn.score(X_test,y_test)
print("预测的结果",y_predict)
print("准确率",socre)
if __name__ == '__main__':
# load_data()
# load_new()
# knn()
navie_bayes()