%%capture
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
print(tf.__version__)
# !unzip archive.zip -n
# import zipfile
# with zipfile.ZipFile("archive.zip", mode="r") as archive:
#     archive.extractall("archive/")
df = pd.read_csv("archive/labeled.csv", dtype={"toxic": np.int8})

df
| | comment | toxic |
|---|---|---|
| 0 | Верблюдов-то за что? Дебилы, бл...\n | 1 |
| 1 | Хохлы, это отдушина затюканого россиянина, мол... | 1 |
| 2 | Собаке - собачья смерть\n | 1 |
| 3 | Страницу обнови, дебил. Это тоже не оскорблени... | 1 |
| 4 | тебя не убедил 6-страничный пдф в том, что Скр... | 1 |
| ... | ... | ... |
| 14407 | Вонючий совковый скот прибежал и ноет. А вот и... | 1 |
| 14408 | А кого любить? Гоблина тупорылого что-ли? Или ... | 1 |
| 14409 | Посмотрел Утомленных солнцем 2. И оказалось, ч... | 0 |
| 14410 | КРЫМОТРЕД НАРУШАЕТ ПРАВИЛА РАЗДЕЛА Т.К В НЕМ Н... | 1 |
| 14411 | До сих пор пересматриваю его видео. Орамбо кст... | 0 |
14412 rows × 2 columns
# Split the data into non-toxic and toxic comments
nondefaults = df[df["toxic"] == 0]
defaults = df[df["toxic"] == 1]

# Undersample the majority class (non-toxic comments)
nondefaults_under = nondefaults.sample(len(defaults))

# Concatenate the undersampled non-toxic comments with the toxic ones
df_balanced = pd.concat(
    [nondefaults_under.reset_index(drop=True), defaults.reset_index(drop=True)], axis=0
)

# Print the value counts for the toxic label
print(df_balanced["toxic"].value_counts())
toxic
0 4826
1 4826
Name: count, dtype: int64
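# An equivalent one-step undersampling (optional sketch, not part of the original
# notebook; needs pandas >= 1.1 for groupby().sample()):
# n_min = df["toxic"].value_counts().min()
# df_balanced = df.groupby("toxic").sample(n_min).reset_index(drop=True)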
df_balanced.sample(5)
| | comment | toxic |
|---|---|---|
| 4317 | Какие интересные крайности\n | 0 |
| 3333 | А-А-А-А-А-А--А-А!!!!!!!\n | 1 |
| 3336 | Стекло разбить. Зная наших мусоров, они даже н... | 1 |
| 2571 | Это неизлечимо. К старости плотность нейронов ... | 1 |
| 2581 | Все конечно правильно, но... За 2 минуты люди ... | 0 |
from sklearn.model_selection import train_test_split

X = df_balanced["comment"]
y = df_balanced["toxic"]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,  # shuffle=True
    random_state=0,
)
y_train.value_counts()
toxic
0 4826
1 2895
Name: count, dtype: int64
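# Note (not part of the original run): with shuffle=False the split keeps the
# concatenation order (non-toxic rows first), which is why y_train above is no
# longer balanced. A stratified shuffle would keep the 50/50 ratio in both splits:
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.2, stratify=y, random_state=0
# )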
from keras_preprocessing import text
from keras_preprocessing import sequence
import string
# from keras.preprocessing.sequence import pad_sequences
from natasha import Doc, MorphVocab, NewsEmbedding, NewsMorphTagger, Segmenter
segmenter = Segmenter()
morph_vocab = MorphVocab()
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)

print(morph_vocab)

max_features = 20_000
class RuTokenizer(text.Tokenizer):
    def __init__(self):
        super().__init__()
        self.num_words = max_features

    def tokenize(self, text):
        # Segment, morph-tag and lemmatize the text with natasha
        doc = Doc(text)
        doc.segment(segmenter)
        doc.tag_morph(morph_tagger)
        for token in doc.tokens:
            token.lemmatize(morph_vocab)
        # tokens = [_.lemma for _ in doc.tokens if _.text not in string.punctuation]
        tokens = [
            _.lemma.lower()
            for _ in doc.tokens
            if _.text not in string.punctuation
            and len(_.text) > 1
            and not _.text.isnumeric()
        ]
        return tokens
tokenizer = RuTokenizer()

# df_balanced["tokens"] = df_balanced["comment"].apply(
#     lambda x: repr(tokenizer.tokenize(x))
# )
# df_balanced["tokens"]
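# Optional sanity check (not in the original notebook): lemmatize a single comment
# to see what the custom tokenizer produces.
# print(tokenizer.tokenize("Страницу обнови, дебил."))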
maxlen = 300

tokenizer.fit_on_texts(X_train)
sequences = tokenizer.texts_to_sequences(X_train)
X_train = sequence.pad_sequences(sequences, maxlen=maxlen)

sequences = tokenizer.texts_to_sequences(X_test)
X_test = sequence.pad_sequences(sequences, maxlen=maxlen)
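# pad_sequences left-pads (and left-truncates) every sequence to exactly maxlen
# integers by default, so all inputs share one fixed shape. Tiny illustration,
# not from the original notebook:
# sequence.pad_sequences([[1, 2, 3]], maxlen=5)
# -> array([[0, 0, 1, 2, 3]], dtype=int32)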
model = keras.models.Sequential(
    [
        keras.layers.Embedding(max_features + 1, 128, input_length=maxlen),
        keras.layers.Bidirectional(
            keras.layers.LSTM(
                64, dropout=0.2, recurrent_dropout=0.2, return_sequences=True
            )
        ),
        keras.layers.Bidirectional(
            keras.layers.LSTM(
                64,
                dropout=0.2,
                recurrent_dropout=0.2,
            )
        ),
        keras.layers.Dense(1, activation="sigmoid"),
    ]
)
model.summary()
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding (Embedding) (None, 300, 128) 2560128
bidirectional (Bidirectional) (None, 300, 128) 98816
bidirectional_1 (Bidirectional) (None, 128) 98816
dense (Dense) (None, 1) 129
=================================================================
Total params: 2,757,889
Trainable params: 2,757,889
Non-trainable params: 0
_________________________________________________________________
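# Where the parameter counts come from (worked out by hand): the embedding stores
# (20_000 + 1) * 128 = 2_560_128 weights; each LSTM direction with 64 units over a
# 128-dimensional input has 4 * 64 * (128 + 64 + 1) = 49_408 weights, doubled to
# 98_816 by Bidirectional (the second bidirectional layer also receives a 128-dim
# input, 2 * 64 from the layer below); the sigmoid head adds 128 + 1 = 129.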
# import keras_tuner

callback = keras.callbacks.EarlyStopping()

model.compile(keras.optimizers.Adam(0.01), "binary_crossentropy", metrics=["accuracy"])
history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=2,
    validation_data=(X_test, y_test),
    callbacks=[callback],
)
len(history.history["loss"])
Epoch 1/2
154/242 [==================>...........] - ETA: 5:35 - loss: 0.4757 - accuracy: 0.7695
(training interrupted manually: KeyboardInterrupt raised during epoch 1)
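# EarlyStopping() with no arguments monitors val_loss with patience=0, so training
# stops at the first epoch where the validation loss fails to improve. A more
# forgiving configuration (an alternative, not what was run above) would be:
# callback = keras.callbacks.EarlyStopping(
#     monitor="val_loss", patience=2, restore_best_weights=True
# )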
sequences = tokenizer.texts_to_sequences(
    ["Я хочу есть", "Я люблю тебя", "на работе был полный пиддес :| и так каждое за..."]
)
test = sequence.pad_sequences(sequences, maxlen=maxlen)

output = model.predict(test)

print(output.flatten())
1/1 [==============================] - 1s 1s/step
[0.5422895 0.98179567 0.01973787]
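# Turning the sigmoid scores into hard labels (an assumed 0.5 cut-off, not shown in
# the original notebook): values above the threshold are flagged as toxic.
# labels = (output.flatten() > 0.5).astype(int)  # [1, 1, 0] for the scores above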
print(sequences)
"model.keras") model.save(
= keras.models.load_model("model.keras") loaded_model
sequences = tokenizer.texts_to_sequences(
    ["Я хочу есть", "Я люблю тебя", "на работе был полный пиддес :| и так каждое за..."]
)
test = sequence.pad_sequences(sequences, maxlen=maxlen)

output = loaded_model.predict(test)

print(output.flatten())
1/1 [==============================] - 0s 98ms/step
[0.5422895 0.98179567 0.01973787]
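# Caveat (not covered by the original notebook): model.save() stores only the
# network. The fitted tokenizer has to be persisted separately, otherwise new text
# cannot be mapped to the same word indices after reloading. One option is pickle:
# import pickle
# with open("tokenizer.pkl", "wb") as f:
#     pickle.dump(tokenizer, f)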