在机器学习领域,新闻组数据集是一个经典的数据集,用于文本分类任务。这里,我们将使用Python的scikit-learn库中的新闻组数据集,并利用朴素贝叶斯分类器来进行分类。
这里我们使用爬虫技术,爬取了5000条新闻数据,涵盖:汽车、财经、科技、健康、体育、教育、文化、军事、娱乐、时尚十个类别。每个类别500条新闻样本。
项目结构图如下:

commons.py
import pandas as pd
import jieba
# 读取数据
df_news = pd.read_table('./dataset/data.txt', names=['category', 'theme', 'URL', 'content'], encoding='utf-8')
df_news = df_news.dropna()
# 停用词处理
stopwords = pd.read_csv(r"./stopwords.txt", index_col=False, sep="\t", quoting=3, names=['stopword'], encoding='utf-8')
stopwords_list = stopwords.stopword.values.tolist()
def preprocess(text, stopwords):
words = jieba.lcut(text)
words_clean = [word for word in words if word not in stopwords]
return ' '.join(words_clean)
train.py
import pandas as pd
import jieba
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, make_scorer
import joblib
import matplotlib.pyplot as plt
from commons import preprocess,stopwords,stopwords_list,df_news
# 分词
content = df_news.content.values.tolist()
content_S = []
for line in content:
current_segment = jieba.lcut(line)
if len(current_segment) > 1 and current_segment != '\r\n':
content_S.append(current_segment)
# 分词并去除停用词
def drop_stopwords(contents, stopwords):
contents_clean = []
for line in contents:
line_clean = [word for word in line if word not in stopwords]
contents_clean.append(line_clean)
return contents_clean
contents_clean = drop_stopwords(content_S, stopwords_list)
# 创建并训练分类器
classifier = MultinomialNB()
# 准备训练数据
df_train = pd.DataFrame({'contents_clean': contents_clean, 'label': df_news['category']})
label_mapping = {"汽车": 1, "财经": 2, "科技": 3, "健康": 4, "体育": 5, "教育": 6, "文化": 7, "军事": 8, "娱乐": 9,
"时尚": 0}
df_train['label'] = df_train['label'].map(label_mapping)
# 划分训练集和测试集
x_train, x_test, y_train, y_test = train_test_split(df_train['contents_clean'].values, df_train['label'].values,
random_state=1)
#训练模型
def train_model():
# 将词列表转换为字符串
x_train_str = [' '.join(doc) for doc in x_train]
x_test_str = [' '.join(doc) for doc in x_test]
# 创建 CountVectorizer 对象
vec = CountVectorizer(analyzer='word', max_features=4000, lowercase=False)
# 用训练数据拟合词汇表
vec.fit(x_train_str)
# 转换训练数据和测试数据
x_train_vec = vec.transform(x_train_str)
x_test_vec = vec.transform(x_test_str)
# 创建并训练分类器
classifier = MultinomialNB()
classifier.fit(x_train_vec, y_train) # 确保在这里训练模型
# 保存模型和特征提取器
joblib.dump(classifier, './model/model.pkl')
joblib.dump(vec, './model/vectorizer.pkl')
# 预测测试集
y_pred = classifier.predict(x_test_vec)
# 评估分类器
print("Accuracy on Test Set:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
# 使用交叉验证计算准确率和损失
scoring = {'accuracy': 'accuracy', 'loss': make_scorer(lambda y_true, y_pred: -accuracy_score(y_true, y_pred))}
cv_results = cross_validate(classifier, x_train_vec, y_train, cv=5, scoring=scoring, return_train_score=True)
# 训练指标展示
train_accuracy = cv_results['train_accuracy']
train_loss = -cv_results['train_loss']
test_accuracy = cv_results['test_accuracy']
test_loss = -cv_results['test_loss']
# 创建一个表格来展示性能指标
performance_metrics = pd.DataFrame({
'Fold': range(1, 6),
'Train Accuracy': train_accuracy,
'Train Loss': train_loss,
'Test Accuracy': test_accuracy,
'Test Loss': test_loss
})
print("Performance Metrics:")
print(performance_metrics)
# 绘制准确率和损失的折线图
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.plot(range(1, 6), train_accuracy, label='Train Accuracy', marker='o')
plt.plot(range(1, 6), test_accuracy, label='Test Accuracy', marker='o')
plt.xlabel('Fold')
plt.ylabel('Accuracy')
plt.title('Accuracy on Training and Test Sets')
plt.legend()
plt.subplot(1, 2, 2)
plt.plot(range(1, 6), train_loss, label='Train Loss', marker='o')
plt.plot(range(1, 6), test_loss, label='Test Loss', marker='o')
plt.xlabel('Fold')
plt.ylabel('Loss')
plt.title('Loss on Training and Test Sets')
plt.legend()
plt.tight_layout()
plt.show()
if __name__ == '__main__':
train_model()
predict.py
import joblib
from commons import preprocess,stopwords,stopwords_list
# 预测函数
def predict_news_category(text, classifier, vec, stopwords):
# 预处理文本
text_clean = preprocess(text, stopwords)
# 转换为特征向量
text_vec = vec.transform([text_clean])
# 预测类别
category = classifier.predict(text_vec)[0]
# 将类别编号转换为类别名称
category_mapping = {1: "汽车", 2: "财经", 3: "科技", 4: "健康", 5: "体育", 6: "教育", 7: "文化", 8: "军事", 9: "娱乐", 0: "时尚"}
return category_mapping[category]
if __name__ == '__main__':
# 加载模型和特征提取器
classifier = joblib.load('./model/model.pkl')
vec = joblib.load('./model/vectorizer.pkl')
# 用户交互界面
print("请输入需要分类的新闻文本:")
user_input = input()
# 将用户输入的值赋给 news_text 变量
news_text = user_input
# 进行预测
predicted_category = predict_news_category(news_text, classifier, vec, stopwords_list)
print(f"预测的新闻类别是:{predicted_category}")
app.py
import joblib
from commons import preprocess, stopwords, stopwords_list
import gradio as gr
from predict import predict_news_category
def predict_news(news_text):
# 加载模型和特征提取器
classifier = joblib.load('./model/model.pkl')
vec = joblib.load('./model/vectorizer.pkl')
# 进行预测
predicted_category = predict_news_category(news_text, classifier, vec, stopwords_list)
return f"预测的新闻类别是:{predicted_category}"
demo = gr.Interface(
fn=predict_news,
title='机器学习-朴素贝叶斯新闻分类案例',
inputs=[gr.Text(label='源新闻文本')],
outputs=[gr.Text(label='预测结果')]
)
if __name__ == "__main__":
demo.launch()
运行效果:

朴素贝叶斯分类器在文本分类任务中表现得非常好,尤其是在像新闻组数据集这样具有多个类别的分类任务中。通过使用scikit-learn的管道功能,我们可以轻松地组合特征提取和模型训练,大大简化了工作流程。