Загрузить файлы в «/»

2024-06-29 11:46:49 +00:00 · 2024-06-29 11:46:49 +00:00 · ad60453b5e
commit ad60453b5e
parent 66e3302311
2 changed files with 9461 additions and 0 deletions
--- a/отзывах.ipynb
+++ b/отзывах.ipynb
--- a/отзывах.py
+++ b/отзывах.py
@@ -0,0 +1,772 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# ### Загрузка исходной базы данных
+
+# In[1]:
+
+
+import pandas as pd
+
+df = pd.read_csv('База_данных_отзывы.csv', delimiter=';',header=None, usecols=[10]) # выбираем нужные столбцы
+df.columns = ['Отзывы']
+documents = df['Отзывы'].dropna().tolist()
+print(documents)
+
+
+# In[2]:
+
+
+documents.pop(0)
+print(documents[0])
+
+
+# ### Предобработка данных
+
+# In[3]:
+
+
+import nltk
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+import string
+
+# Загрузка стоп-слов и пунктуации
+nltk.download('stopwords')
+nltk.download('punkt')
+stop_words = set(stopwords.words('russian'))
+punctuation = set(string.punctuation)
+
+preprocessed_text = []
+
+def preprocess_text(text):
+    words = word_tokenize(text.lower())  # Привести к нижнему регистру и токенизировать
+    filtered_words = [word for word in words if word not in stop_words and word not in punctuation]
+    return " ".join(filtered_words)
+
+# Пример предобработки текстовых данных
+for i in documents:
+    text = i
+    preprocessed_text.append(preprocess_text(text))
+print(preprocessed_text[10])
+
+
+# In[4]:
+
+
+import spacy
+from spacy import load
+from spacy.lang.ru.examples import sentences
+from spacy.lang.ru import Russian
+
+# Collects the lemmatized form of every review.
+preprocessed_lemmatized_text = []
+
+# Load the spaCy language model
+nlp = spacy.load("ru_core_news_md")
+
+def preprocess_and_lemmatize(text):
+    doc = nlp(text.lower())  # Привести к нижнему регистру и лемматизировать
+    lemmatized_words = [token.lemma_ for token in doc if token.text not in punctuation and token.text not in stop_words]
+    return " ".join(lemmatized_words)
+
+# Пример предобработки и лемматизации текста
+for word in preprocessed_text:
+    text = word
+    preprocessed_lemmatized_text.append(preprocess_and_lemmatize(text))
+print(preprocessed_lemmatized_text[10])
+
+
+# In[6]:
+
+
+print(len(preprocessed_lemmatized_text))
+
+
+# In[17]:
+
+
+print(preprocessed_lemmatized_text[940:950])
+
+
+# In[37]:
+
+
+import csv
+with open('out.csv', 'w', encoding="utf-8", newline='') as file:
+    writer = csv.writer(file)
+    writer.writerow(preprocessed_lemmatized_text) #сохранение в csv файл лемматизированные отзывы
+
+
+# ### Формирование TF-IDF матрицы ключевых слов
+
+# In[5]:
+
+
+from sklearn.feature_extraction.text import TfidfVectorizer
+import pandas as pd
+
+# Пример текстовых данных
+
+
+# Создание объекта TfidfVectorizer
+tfidf_vectorizer = TfidfVectorizer()
+
+# Применение TF-IDF к текстовым данным
+tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_lemmatized_text)
+
+# Получение списка ключевых слов и их значения TF-IDF для первого документа
+feature_names = tfidf_vectorizer.get_feature_names()
+tfidf_scores = tfidf_matrix.toarray()[0]
+
+# Сортировка слов по значениям TF-IDF
+sorted_keywords = [word for _, word in sorted(zip(tfidf_scores, feature_names), reverse=True)]
+
+print("Ключевые слова:", sorted_keywords)
+
+
+# In[9]:
+
+
+tfidf_matrix.shape
+
+
+# In[9]:
+
+
+print(tfidf_matrix)
+
+
+# ### Преобразование столбца "Ценности" в числовой список для классификатора
+
+# In[23]:
+
+
+tfidf_matrix.shape
+
+
+# In[24]:
+
+
+tsvd_data2D.shape
+
+
+# In[6]:
+
+
+import pandas as pd
+
+df = pd.read_csv('База_данных_отзывы.csv', delimiter=';',header=0, usecols=[10,17]) # выбираем нужные столбцы
+
+
+# In[7]:
+
+
+df = df.dropna()
+
+
+# In[8]:
+
+
+df["Ценности"] = df["Ценности"].replace({'релевантно':1, 'не определено':0 })
+
+
+# In[9]:
+
+
+values = df['Ценности'].tolist()
+
+
+# In[16]:
+
+
+print(values[940:950])
+
+
+# In[38]:
+
+
+df["Ценности"].sum()
+
+
+# ### Классификатор случайного леса
+
+# #### Использование матрицы  TF-IDF обычной размерности
+
+# In[10]:
+
+
+from sklearn.model_selection import train_test_split
+
+x = tfidf_matrix
+y = values
+
+X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=90)
+
+
+# In[11]:
+
+
+from sklearn.ensemble import RandomForestClassifier
+
+clf = RandomForestClassifier(n_estimators=3000,max_depth=2)
+clf.fit(X_train, y_train)
+y_pred = clf.predict(X_test)
+
+
+# In[13]:
+
+
+from sklearn import metrics
+print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
+
+
+# In[12]:
+
+
+from sklearn.metrics import classification_report
+from sklearn.metrics import confusion_matrix
+
+print(classification_report(y_test, y_pred))
+
+
+# In[36]:
+
+
+import numpy as np
+np.array(y_test).sum()
+
+
+# In[41]:
+
+
+y_pred.shape
+
+
+# In[43]:
+
+
+y_pred
+
+
+# In[42]:
+
+
+y_pred.sum()
+
+
+# In[22]:
+
+
+tsvd_data2D.shape
+
+
+# #### Использование матрицы  TF-IDF пониженной размерности
+
+# In[14]:
+
+
+from sklearn.decomposition import TruncatedSVD
+## Понижение размерности
+tsvd2D = TruncatedSVD(n_components=1024)
+tsvd_data2D = tsvd2D.fit_transform(tfidf_matrix)
+
+
+# In[15]:
+
+
+from sklearn.model_selection import train_test_split
+
+x = tsvd_data2D
+y = values
+
+X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=90)
+
+from sklearn.ensemble import RandomForestClassifier
+
+clf = RandomForestClassifier(n_estimators=2000,max_depth=2)
+clf.fit(X_train, y_train)
+y_pred = clf.predict(X_test)
+
+
+# In[17]:
+
+
+from sklearn.metrics import classification_report
+from sklearn.metrics import confusion_matrix
+
+print(classification_report(y_test, y_pred))
+
+
+# In[16]:
+
+
+from sklearn import metrics
+print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
+
+
+# In[65]:
+
+
+y_pred.sum()
+
+
+# ### Метод K-ближайших соседей
+
+# #### Использование матрицы  TF-IDF обычной размерности
+
+# In[221]:
+
+
+from sklearn.model_selection import train_test_split
+
+x = tfidf_matrix
+y = values
+
+X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=90)
+
+
+# In[222]:
+
+
+from sklearn.neighbors import KNeighborsClassifier
+
+model = KNeighborsClassifier(n_neighbors = 1024)
+
+model.fit(X_train, y_train)
+y_pred = model.predict(X_test)
+
+
+# In[224]:
+
+
+
+from sklearn import metrics
+print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
+print(sum(y_pred), y_pred.shape)
+
+
+# In[215]:
+
+
+from sklearn.metrics import classification_report
+from sklearn.metrics import confusion_matrix
+
+print(classification_report(y_test, y_pred))
+
+
+# In[223]:
+
+
+print(sum(y_pred), y_pred.shape)
+
+
+# #### Использование матрицы  TF-IDF пониженной размерности
+
+# In[25]:
+
+
+# возьмем в качестве X матрицу пониженной размерности
+from sklearn.model_selection import train_test_split
+
+x = tsvd_data2D
+y = values
+
+X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=90)
+
+from sklearn.neighbors import KNeighborsClassifier
+
+model = KNeighborsClassifier(n_neighbors = 1024)
+
+model.fit(X_train, y_train)
+y_pred = model.predict(X_test)
+
+
+# In[226]:
+
+
+
+from sklearn import metrics
+print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
+print(sum(y_pred), y_pred.shape)
+
+
+# In[227]:
+
+
+from sklearn.metrics import classification_report
+from sklearn.metrics import confusion_matrix
+
+print(classification_report(y_test, y_pred))
+
+
+# ### Логистическая регрессия
+
+# #### Использование матрицы  TF-IDF обычной размерности
+
+# In[232]:
+
+
+from sklearn.datasets import load_iris
+from sklearn.linear_model import LogisticRegression
+x = tfidf_matrix
+y = values
+
+X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3,random_state=90)
+clf = LogisticRegression().fit(X_train, y_train)
+y_pred = clf.predict(X_test)
+
+from sklearn import metrics
+print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
+print(sum(y_pred), y_pred.shape)
+
+
+# In[233]:
+
+
+from sklearn.metrics import classification_report
+from sklearn.metrics import confusion_matrix
+
+print(classification_report(y_test, y_pred))
+
+
+# #### Использование матрицы  TF-IDF пониженной размерности
+
+# In[234]:
+
+
+## возьмем в качестве X матрицу пониженной размерности
+from sklearn.datasets import load_iris
+from sklearn.linear_model import LogisticRegression
+x = tsvd_data2D
+y = values
+
+X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3,random_state=90)
+clf = LogisticRegression().fit(X_train, y_train)
+y_pred = clf.predict(X_test)
+
+from sklearn import metrics
+print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
+print(sum(y_pred), y_pred.shape)
+
+
+# In[235]:
+
+
+from sklearn.metrics import classification_report
+from sklearn.metrics import confusion_matrix
+
+print(classification_report(y_test, y_pred))
+
+
+# ### Тематическое моделирование
+
+# #### Неотрицательная матричная факторизация (NMF)
+
+# In[19]:
+
+
+from sklearn.decomposition import NMF
+import csv
+# Подготовка данных
+X = tfidf_matrix
+
+# Применение NMF
+num_topics = 1000
+nmf = NMF(n_components=num_topics, random_state=42)
+nmf.fit(X)
+
+# Вывод слов для каждой темы
+feature_names = tfidf_vectorizer.get_feature_names()
+with open('Тематическое_моделирование_NMF.csv', 'w', encoding="utf-8", newline='') as file:
+        writer = csv.writer(file)
+        for topic_idx, topic_words in enumerate(nmf.components_):
+            top_words_idx = topic_words.argsort()[-10:][::-1]
+            top_words = [feature_names[i] for i in top_words_idx]
+            writer.writerow(top_words)
+
+
+# In[20]:
+
+
+nmf_matrix = nmf.fit_transform(X)
+
+
+# In[50]:
+
+
+nmf_tr = nmf_matrix.T
+
+
+# In[107]:
+
+
+trans = nmf_tr.T
+print(trans) # наша первоначальная матрица с уверенностью, что отзыв принадлежит теме
+
+
+# In[37]:
+
+
+tm = pd.DataFrame(nmf_matrix, columns=['Тема№'+f'{i}' for i in range(1,1001)])
+tm.head(10)
+
+
+# In[ ]:
+
+
+
+
+
+# In[38]:
+
+
+tm.insert(loc = 1000,
+          column = 'Ценности',
+          value = values)
+
+
+# In[39]:
+
+
+tm.head(10)
+
+
+# In[71]:
+
+
+q = tm[(tm['Ценности'] == 1)]
+q
+
+
+# In[72]:
+
+
+q.shape
+
+
+# In[78]:
+
+
+name = list(tm.columns)
+name.pop(-1)
+w = q[name]
+
+
+# In[83]:
+
+
+w
+
+
+# In[92]:
+
+
+res = []
+for column in w.columns:
+     
+    # Storing the rows of a column 
+    # into a temporary list
+    li = w[column].tolist()
+     
+    # appending the temporary list
+    res.append(li)
+print(len(res))
+
+
+# In[91]:
+
+
+t = [sum(nmf_tr[i]) for i in range(1000)] # суммы по темам по всем отзывам
+print(t)
+
+
+# In[94]:
+
+
+sum_1 = [sum(res[i]) for i in range(1000)] # суммы по темам по релевантным отзывам
+print(sum_1)
+
+
+# In[152]:
+
+
+import numpy as np
+p = np.array(sum_1)/np.array(t)
+print(p) # средняя вероятность по всем релевантным отзывам по темам, т.е. вероятность того, что тема соответствует культурной ценности
+
+
+# In[237]:
+
+
+pik = p[p > 0.9] # выберем те темы, где вероятность больше 0.9
+print(pik)
+
+
+# In[246]:
+
+
+inde = []
+
+for i in pik: # 
+    inde.append(p.tolist().index(i))
+print(inde) 
+
+
+# In[247]:
+
+
+with open('Тематическое_моделирование_NMF.csv',encoding="utf-8") as fd:
+    reader=csv.reader(fd)
+    interestingrows=[row for idx, row in enumerate(reader) if idx in inde]
+
+
+# In[249]:
+
+
+for i in interestingrows:
+    print(i) #рассмотрим, что это за темы по ключевым словам
+
+
+# In[173]:
+
+
+ind = [i for i in range(1,1001)]
+
+
+# In[181]:
+
+
+from matplotlib.backends.backend_pdf import PdfPages
+
+
+# In[191]:
+
+
+
+fig = plt.figure(figsize=(50,20))
+plt.xlabel('Номер Темы')
+plt.ylabel('Вероятность соотношения темы с ценностями')
+plt.bar(ind,p)
+#plt.show()
+pdf = PdfPages("Гистограмма.pdf")
+pdf.savefig(fig)
+
+
+# Сохранение файла
+pdf.close()
+
+
+# In[183]:
+
+
+min_v = min(p)
+max_v = max(p)
+print(min_v, max_v)
+
+
+# In[153]:
+
+
+itog = []
+for i in trans:
+    y = np.array(i)*np.array(p)
+    itog.append(y)
+print(itog)
+    
+
+
+# In[154]:
+
+
+np.array(itog).shape
+
+
+# In[155]:
+
+
+da = pd.DataFrame(np.array(itog), columns=['Тема№'+f'{i}' for i in range(1,1001)])
+da.head(10) # итоговый вариант
+
+
+# ### Кластеризация и категоризация текстовых данных
+
+# In[42]:
+
+
+from sklearn.cluster import KMeans
+import numpy as np
+
+import csv
+with open('кластеры_отзывов.csv', 'w', encoding="utf-8", newline='') as file:
+    writer = csv.writer(file)
+    writer.writerow(preprocessed_lemmatized_text) 
+    
+# Применение кластеризации KMeans к матрице TF-IDF
+num_clusters = 100
+kmeans = KMeans(n_clusters=num_clusters, random_state=0)
+kmeans.fit(tfidf_matrix)
+klaster = []
+with open('кластеры_отзывов.csv', 'w', encoding="utf-8", newline='') as file:
+    writer = csv.writer(file)
+# Показать примеры документов в каждом кластере
+    for cluster_id in range(num_clusters):
+        cluster_indices = np.where(kmeans.labels_ == cluster_id)[0]
+        print(f"Кластер {cluster_id + 1}:")
+        for idx in cluster_indices:
+            print(documents[idx])
+            klaster.append(documents[idx])
+        writer.writerow(klaster)
+        klaster = []
+        print("--------")
+
+
+# In[41]:
+
+
+print(documents[0])
+
+
+# ### Синонимы
+
+# In[18]:
+
+
+import bs4 as bs
+import urllib.request
+import re
+import nltk
+from nltk.corpus import stopwords
+from gensim.models import Word2Vec
+# Объединяем вcе отзывы в переменной article_text.
+article_text = ""
+for p in documents: article_text += p
+# переводим все символы в нижний регистр.
+cleaned_article = article_text.lower()
+# Оставляем только буквы и убираем пробелы, используя регулярные выражения.
+cleaned_article = re.sub('[^a-я]', ' ', cleaned_article)
+cleaned_article = re.sub(r'\s+', ' ', cleaned_article)
+# Готовим датасет для обучения
+all_sentences = nltk.sent_tokenize(cleaned_article)
+all_words = [nltk.word_tokenize(sent) for sent in all_sentences]
+# Проходимся по датасету и удаляем стоп-слова 
+for i in range(len(all_words)):
+    all_words[i] = [w for w in all_words[i] if w not in stopwords.words('russian')]
+#Создаем модель Word2Vec со словами, чаще всего встречающимися в тексте
+word2vec = Word2Vec(all_words, min_count=3)
+print(word2vec.wv.most_similar('справедливость', topn=5))
+
+
+# In[23]:
+
+
+print(word2vec.wv.most_similar('труд', topn=100))
+
+
+# In[122]:
+
+
+import csv
+with open('словарь_слов_из_отзывов.csv', 'w', encoding="utf-8", newline='') as file:
+    writer = csv.writer(file)
+    writer.writerow(all_words)
+