Toxicity Classifier in Russian

python
Published January 1, 2024

%%capture
# %%capture silences this setup cell's output (TF import noise and the version print below)

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras

print(tf.__version__)
# !unzip -n archive.zip

# Pure-Python equivalent of the unzip call above:
# import zipfile

# with zipfile.ZipFile("archive.zip", mode="r") as archive:
#     archive.extractall("archive/")  # extract once; no need to loop over namelist()
df = pd.read_csv("archive/labeled.csv", dtype={"toxic": np.int8})

df
                                                 comment  toxic
0                   Верблюдов-то за что? Дебилы, бл...\n      1
1      Хохлы, это отдушина затюканого россиянина, мол...      1
2                              Собаке - собачья смерть\n      1
3      Страницу обнови, дебил. Это тоже не оскорблени...      1
4      тебя не убедил 6-страничный пдф в том, что Скр...      1
...                                                  ...    ...
14407  Вонючий совковый скот прибежал и ноет. А вот и...      1
14408  А кого любить? Гоблина тупорылого что-ли? Или ...      1
14409  Посмотрел Утомленных солнцем 2. И оказалось, ч...      0
14410  КРЫМОТРЕД НАРУШАЕТ ПРАВИЛА РАЗДЕЛА Т.К В НЕМ Н...      1
14411  До сих пор пересматриваю его видео. Орамбо кст...      0

14412 rows × 2 columns
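The raw labels are imbalanced: 4,826 of the 14,412 comments are toxic, so roughly two thirds are non-toxic. A quick check:

# 0 (non-toxic): 9586, 1 (toxic): 4826
print(df["toxic"].value_counts())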

# Split the data into toxic and non-toxic comments
nontoxic = df[df["toxic"] == 0]
toxic = df[df["toxic"] == 1]

# Undersample the majority (non-toxic) class down to the toxic class size
nontoxic_under = nontoxic.sample(len(toxic), random_state=0)  # seed for reproducibility

# Concatenate the undersampled non-toxic comments with the toxic ones
df_balanced = pd.concat(
    [nontoxic_under.reset_index(drop=True), toxic.reset_index(drop=True)], axis=0
)

# Print the class counts for the balanced data set
print(df_balanced["toxic"].value_counts())
toxic
0    4826
1    4826
Name: count, dtype: int64
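Undersampling discards about half of the non-toxic comments. An alternative that keeps all the data is to weight the loss per class instead, since Keras's fit() accepts a class_weight dict; a sketch against the full df:

# Hypothetical alternative: keep every row and upweight the rarer toxic class
n_nontoxic = (df["toxic"] == 0).sum()  # 9586
n_toxic = (df["toxic"] == 1).sum()  # 4826
class_weight = {0: 1.0, 1: n_nontoxic / n_toxic}  # toxic errors weigh roughly 2x
# later: model.fit(..., class_weight=class_weight)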
df_balanced.sample(5)
                                                comment  toxic
4317                       Какие интересные крайности\n      0
3333                          А-А-А-А-А-А--А-А!!!!!!!\n      1
3336  Стекло разбить. Зная наших мусоров, они даже н...      1
2571  Это неизлечимо. К старости плотность нейронов ...      1
2581  Все конечно правильно, но... За 2 минуты люди ...      0
from sklearn.model_selection import train_test_split

X = df_balanced["comment"]
y = df_balanced["toxic"]

# NOTE: with shuffle=False the split keeps the concatenation order from above
# (all non-toxic rows first), so the 80/20 split is not class-balanced;
# the value_counts() below shows the skew. shuffle=True with stratify=y
# would avoid this (see the sketch after the output).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
    random_state=0,  # ignored when shuffle=False
)

y_train.value_counts()
toxic
0    4826
1    2895
Name: count, dtype: int64
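Since the balanced frame is ordered by class, the unshuffled split trains mostly on non-toxic comments and validates mostly on toxic ones. A stratified, shuffled split (same sklearn call, different arguments) keeps the 50/50 ratio on both sides; a minimal sketch:

# Alternative split: shuffle and stratify so train and test both stay 50/50
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, stratify=y, random_state=0
)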
from keras_preprocessing import text
from keras_preprocessing import sequence
import string

from natasha import Doc, MorphVocab, NewsEmbedding, NewsMorphTagger, Segmenter

# Natasha pipeline pieces for Russian NLP
segmenter = Segmenter()  # splits raw text into tokens and sentences
morph_vocab = MorphVocab()  # morphological vocabulary used for lemmatization
emb = NewsEmbedding()  # embeddings that back the morphology tagger
morph_tagger = NewsMorphTagger(emb)  # tokens must be tagged before lemmatize()

print(morph_vocab)  # sanity check; prints "MorphVocab()"

max_features = 20_000  # cap the vocabulary at the ~20k most frequent tokens


class RuTokenizer(text.Tokenizer):
    """Keras Tokenizer that lemmatizes Russian text with Natasha."""

    def __init__(self):
        # Pass tokenize() as the analyzer so fit_on_texts() and
        # texts_to_sequences() actually use it (this needs a keras-preprocessing
        # version that supports the `analyzer` argument). The original code
        # called super().__init__(self), which passed `self` as num_words.
        super().__init__(num_words=max_features, analyzer=self.tokenize)

    def tokenize(self, text):
        doc = Doc(text)
        doc.segment(segmenter)
        doc.tag_morph(morph_tagger)
        for token in doc.tokens:
            token.lemmatize(morph_vocab)
        # Keep lowercased lemmas; drop punctuation, single characters, and numbers
        tokens = [
            _.lemma.lower()
            for _ in doc.tokens
            if _.text not in string.punctuation
            and len(_.text) > 1
            and not _.text.isnumeric()
        ]

        return tokens


tokenizer = RuTokenizer()
# df_balanced["tokens"] = df_balanced["comment"].apply(
#     lambda x: repr(tokenizer.tokenize(x))
# )
# df_balanced["tokens"]
maxlen = 300  # pad or truncate every comment to 300 tokens

# Build the vocabulary on the training texts only, then turn each comment
# into a fixed-length sequence of integer token ids
tokenizer.fit_on_texts(X_train)
sequences = tokenizer.texts_to_sequences(X_train)
X_train = sequence.pad_sequences(sequences, maxlen=maxlen)

sequences = tokenizer.texts_to_sequences(X_test)
X_test = sequence.pad_sequences(sequences, maxlen=maxlen)
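By default pad_sequences pads (and truncates) on the left, so short comments end up with leading zeros:

# padding='pre' is the default: zeros are prepended up to maxlen
print(sequence.pad_sequences([[1, 2], [3, 4, 5]], maxlen=4))
# [[0 0 1 2]
#  [0 3 4 5]]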

model = keras.models.Sequential(
    [
        # Learn a 128-dim embedding for each of the max_features + 1 token ids
        keras.layers.Embedding(max_features + 1, 128, input_length=maxlen),
        # The first BiLSTM returns the full sequence so the second can consume it
        keras.layers.Bidirectional(
            keras.layers.LSTM(
                64, dropout=0.2, recurrent_dropout=0.2, return_sequences=True
            )
        ),
        keras.layers.Bidirectional(
            keras.layers.LSTM(
                64,
                dropout=0.2,
                recurrent_dropout=0.2,
            )
        ),
        # Single sigmoid unit: probability that the comment is toxic
        keras.layers.Dense(1, activation="sigmoid"),
    ]
)


model.summary()
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 embedding (Embedding)       (None, 300, 128)          2560128   
                                                                 
 bidirectional (Bidirectiona  (None, 300, 128)         98816     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 128)              98816     
 nal)                                                            
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
=================================================================
Total params: 2,757,889
Trainable params: 2,757,889
Non-trainable params: 0
_________________________________________________________________
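The counts in the summary can be checked by hand (each LSTM has four gate matrices over the concatenated input and recurrent state, plus biases):

emb = (20_000 + 1) * 128  # 2,560,128: one 128-dim vector per token id
lstm = 4 * 64 * (128 + 64 + 1)  # 49,408: 4 gates x 64 units x (128 input + 64 recurrent + bias)
bilstm = 2 * lstm  # 98,816: forward + backward; the same for both layers, since
# the first BiLSTM also outputs 128 features (64 per direction)
dense = 128 + 1  # 129: weights plus bias on the final 128-dim state
print(emb + 2 * bilstm + dense)  # 2757889, matching "Total params" above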
# import keras_tuner
# EarlyStopping() with defaults monitors val_loss with patience=0,
# i.e. training stops the first time validation loss fails to improve
callback = keras.callbacks.EarlyStopping()
model.compile(keras.optimizers.Adam(0.01), "binary_crossentropy", metrics=["accuracy"])
history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=2,
    validation_data=(X_test, y_test),
    callbacks=[callback],
)

len(history.history["loss"])
Epoch 1/2
154/242 [==================>...........] - ETA: 5:35 - loss: 0.4757 - accuracy: 0.7695
(The fit was interrupted by hand at this point: model.fit raised KeyboardInterrupt mid-epoch, so the model below carries the partially trained weights from the first epoch.)
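Interrupting by hand works, but the callback can do the stopping. With defaults it is very aggressive; a more forgiving setup (standard Keras arguments) would be:

# Wait two epochs without val_loss improvement, then restore the best weights
callback = keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=2, restore_best_weights=True
)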
# Probe sentences: "I want to eat", "I love you", and
# "work was a complete mess :| and it's like this every..." (truncated in the data)
sequences = tokenizer.texts_to_sequences(
    ["Я хочу есть", "Я люблю тебя", "на работе был полный пиддес :| и так каждое за..."]
)
test = sequence.pad_sequences(sequences, maxlen=maxlen)

output = model.predict(test)

print(output.flatten())
1/1 [==============================] - 1s 1s/step
[0.5422895  0.98179567 0.01973787]
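(These probabilities should not be read too literally: «Я люблю тебя» ("I love you") scores 0.98 toxic, which is not surprising for a model stopped partway through its first epoch.)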
# Round-trip the model through the native Keras format and confirm the
# reloaded copy reproduces the predictions above
model.save("model.keras")
loaded_model = keras.models.load_model("model.keras")
sequences = tokenizer.texts_to_sequences(
    ["Я хочу есть", "Я люблю тебя", "на работе был полный пиддес :| и так каждое за..."]
)
test = sequence.pad_sequences(sequences, maxlen=maxlen)

output = loaded_model.predict(test)

print(output.flatten())
1/1 [==============================] - 0s 98ms/step
[0.5422895  0.98179567 0.01973787]
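Identical outputs before and after the reload confirm the round trip. For convenience, the tokenize, pad, and predict steps can be wrapped in one helper; a sketch (predict_toxicity is a made-up name, and note that model.keras stores only the network, so the fitted tokenizer has to be persisted separately, e.g. with pickle):

def predict_toxicity(texts, model=loaded_model, tok=tokenizer):
    """Return one toxicity probability per input string (hypothetical helper)."""
    seqs = tok.texts_to_sequences(texts)
    padded = sequence.pad_sequences(seqs, maxlen=maxlen)
    return model.predict(padded).flatten()


print(predict_toxicity(["Я люблю тебя"]))  # "I love you"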