From e6c3866b11e9607262e7f339bceebd208e83685b Mon Sep 17 00:00:00 2001 From: CompatHelper Julia Date: Tue, 16 Sep 2025 23:29:50 +0000 Subject: [PATCH 1/4] CompatHelper: add new compat entry for DelimitedFiles at version 1, (keep existing compat) --- Project.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/Project.toml b/Project.toml index 14e7007..ba10bc8 100644 --- a/Project.toml +++ b/Project.toml @@ -30,6 +30,7 @@ CUDA = "3, 4, 5" CorpusLoaders = "0.3" DataDeps = "0.7" DataStructures = "0.18, 0.19, 0.20" +DelimitedFiles = "1" Flux = "0.16, 0.17" Functors = "0.4, 0.5, 0.6" JSON = "0.21, 0.22" From 40690c4debd5deabd92969dc9172d633b4a080a3 Mon Sep 17 00:00:00 2001 From: Roman S Samarev Date: Sat, 18 Oct 2025 23:18:27 +0300 Subject: [PATCH 2/4] method tokens(): added explicit TextAnalysis namespace use --- src/averagePerceptronTagger.jl | 2 +- src/sentiment.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/averagePerceptronTagger.jl b/src/averagePerceptronTagger.jl index a791fc4..571c52c 100644 --- a/src/averagePerceptronTagger.jl +++ b/src/averagePerceptronTagger.jl @@ -313,7 +313,7 @@ predict(tagger::PerceptronTagger, sd::StringDocument) = predict(tagger::PerceptronTagger, fd::FileDocument) = predict(tagger, text(fd)) predict(tagger::PerceptronTagger, td::TokenDocument) = - predict(tagger, tokens(td)) + predict(tagger, TextAnalysis.tokens(td)) function predict(tagger::PerceptronTagger, ngd::NGramDocument) @warn "POS tagging for NGramDocument not available." end diff --git a/src/sentiment.jl b/src/sentiment.jl index 4bcffd6..212b64f 100644 --- a/src/sentiment.jl +++ b/src/sentiment.jl @@ -94,5 +94,5 @@ Predict sentiment of the input doc in range 0 to 1, 0 being least sentiment scor - handle_unknown = A function for handling unknown words. 
Should return an array (default x->tuple()) """ function(m::SentimentAnalyzer)(d::AbstractDocument, handle_unknown = x->tuple()) - m.model(handle_unknown, tokens(d)) + m.model(handle_unknown, TextAnalysis.tokens(d)) end From 061781e57da3f605421202d42ad8cfc1f02f8b1e Mon Sep 17 00:00:00 2001 From: Roman S Samarev Date: Sat, 18 Oct 2025 23:01:09 +0300 Subject: [PATCH 3/4] fixed documentation --- docs/Project.toml | 2 + docs/src/APIReference.md | 7 + docs/src/ULMFiT.md | 157 +++---- docs/src/crf.md | 158 ++++--- docs/src/index.md | 2 +- docs/src/ner.md | 96 ++--- docs/src/sentiment.md | 10 +- docs/src/tagging.md | 137 +++--- src/CRF/loss.jl | 60 ++- src/averagePerceptronTagger.jl | 752 +++++++++++++++++---------------- src/sequence/ner.jl | 28 ++ src/sequence/pos.jl | 33 ++ 12 files changed, 763 insertions(+), 679 deletions(-) diff --git a/docs/Project.toml b/docs/Project.toml index 87eab8e..3057e5d 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,4 +1,6 @@ [deps] Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" TextAnalysis = "a2db99b7-8b79-58f8-94bf-bbc811eef33d" TextModels = "77b9cbda-2a23-51df-82a3-24144d1cd378" +WordTokenizers = "796a5d58-b03d-544a-977e-18100b691f6e" diff --git a/docs/src/APIReference.md b/docs/src/APIReference.md index d01cbef..09c049f 100644 --- a/docs/src/APIReference.md +++ b/docs/src/APIReference.md @@ -4,3 +4,10 @@ Modules = [TextModels, TextModels.ULMFiT] Order = [:function, :type] ``` + +## Constructor Functions + +```@docs +NERTagger +PoSTagger +``` diff --git a/docs/src/ULMFiT.md b/docs/src/ULMFiT.md index 89622d1..48a28e2 100644 --- a/docs/src/ULMFiT.md +++ b/docs/src/ULMFiT.md @@ -1,10 +1,10 @@ # ULMFiT -This is the implementation of [Universal Language Model Fine-tuning for Text Classification](https://arxiv.org/pdf/1801.06146.pdf) paper released by the Jeremy Howard and Sebastian Ruder. The model can be used for several classification tasks in Natural Language Processing domain. The model follows the concept of [Transfer learning](https://en.wikipedia.org/wiki/Transfer_learning). Here, the model was trained to perform Sentiment Analysis task. The weights for that is also provided and also the weights for the Language model part of the ULMFiT is provided so that it can be used to fine-tune the model for different tasks. +This is the implementation of the [Universal Language Model Fine-tuning for Text Classification](https://arxiv.org/pdf/1801.06146.pdf) paper released by Jeremy Howard and Sebastian Ruder. The model can be used for several classification tasks in the Natural Language Processing domain. The model follows the concept of [Transfer learning](https://en.wikipedia.org/wiki/Transfer_learning). Here, the model was trained to perform a Sentiment Analysis task. The weights for this are also provided, as well as the weights for the Language model part of ULMFiT, so that it can be used to fine-tune the model for different tasks. ## Data Loading and Preprocessing -Proper preprocessing is essential before start training ULMFiT. For pretraining step for Language model, a general-purpose corpus is needed, which here is WikiText-103 by default. Similarly, for fine-tuning Language Model and fine-tuning classifier we need a dataset for the specific task (example IMDB for Sentiment Analysis, large scale AG news and DBpedia ontology datasets for Topic classification etc). To load data for these steps, data loaders are needed to be defined. 
Since the data used to train for such a large model is large, so it is not recommended to load all the data at once, instead the data should be loaded in batches through concept of tasks (or coroutines) in Julia (Refer [this](https://docs.julialang.org/en/v1.0/manual/control-flow/#man-tasks-1) documentation for understanding tasks in Julia) using `Channels`. Basically, we need to create `Channel` which supply a mini-batch at every call. As example the functions used for preprocessing of the IMDB dataset used is given in the `data_loaders.jl` in ULMFiT directory. Also, for loading WikiText-103 dataset and IMDB dataset default functions are provided in same file. +Proper preprocessing is essential before starting training ULMFiT. For the pretraining step for Language model, a general-purpose corpus is needed, which here is WikiText-103 by default. Similarly, for fine-tuning Language Model and fine-tuning classifier we need a dataset for the specific task (example IMDB for Sentiment Analysis, large scale AG news and DBpedia ontology datasets for Topic classification etc). To load data for these steps, data loaders need to be defined. Since the data used to train such a large model is large, it is not recommended to load all the data at once, instead the data should be loaded in batches through the concept of tasks (or coroutines) in Julia (Refer to [this](https://docs.julialang.org/en/v1.0/manual/control-flow/#man-tasks-1) documentation for understanding tasks in Julia) using `Channels`. Basically, we need to create a `Channel` which supplies a mini-batch at every call. As an example, the functions used for preprocessing of the IMDB dataset are given in the `data_loaders.jl` in the ULMFiT directory. Also, for loading WikiText-103 dataset and IMDB dataset, default functions are provided in the same file. Default data loaders are provided in the `data_loaders.jl`: @@ -12,11 +12,11 @@ Default data loaders are provided in the `data_loaders.jl`: * `imdb_fine_tune_data` : returns `Channel` for loading fine-tuning data from IMDb movie review dataset * `imdb_classifier_data` : returns `Channel` for loading classification data from IMDB movie review dataset for binary sentiment analysis - To make custom loaders, have a look into these functions. These will give clear idea of preparation of batches inside data loaders. + To make custom loaders, have a look at these functions. These will give a clear idea of the preparation of batches inside data loaders. -## Step 1 - Pre-training Language Model +## Step 1 - Pretraining Language Model -In this step, Language Model will learn the general properties of the Language. To train the model we need a general domain corpus like WikiText-103. For training, a `generator` function is provided to create a `Channel` which will give mini-batch in every call. After pre-processing the corpus, the tokenized corpus is given as input to the generator function and the Channel can be created like so: +In this step, the Language Model will learn the general properties of the language. To train the model we need a general domain corpus like WikiText-103. For training, a `generator` function is provided to create a `Channel` which will give a mini-batch in every call. 
After pre-processing the corpus, the tokenized corpus is given as input to the generator function and the Channel can be created like so: ```julia julia> loader = ULMFiT.imdb_fine_tune_data(4, 10) # batchsize=4, bptt=10 Channel{Any}(0) (1 item available) @@ -51,17 +51,17 @@ julia> Y = take!(loader) ["say", "office", ",", "a"] ``` -Note that at the first call to this `Channel` the output will be maximum number of batches which it can give. Two calls to this `Channel` completed one batch, that is, it doesnot give `X` and `Y` both together in one call, two calls are needed, one first `X` is given out and in second `Y`. Also, to understand what are `batchsize` and `bptt`, refer this [blog](https://nextjournal.com/ComputerMaestro/jsoc19-practical-implementation-of-ulmfit-in-julia-2). +Note that at the first call to this `Channel` the output will be the maximum number of batches which it can give. Two calls to this `Channel` complete one batch, that is, it does not give `X` and `Y` both together in one call, two calls are needed: first `X` is given out and in the second call `Y`. Also, to understand what `batchsize` and `bptt` are, refer to this [blog](https://nextjournal.com/ComputerMaestro/jsoc19-practical-implementation-of-ulmfit-in-julia-2). -### Training Language Model: +### Training Language Model -File `pretrain_lm.jl` contains the whole implementation of the `LanguageModel`. To start training, first, create an instance of `LanguageModel` type, then use the below specified function with appropriate arguments. +The file `pretrain_lm.jl` contains the whole implementation of the `LanguageModel`. To start training, first create an instance of the `LanguageModel` type, then use the below specified function with appropriate arguments. ```julia julia> lm = LanguageModel() ``` -It has several arguments to defined the internal structure of the `LanguageModel` instance: +It has several arguments to define the internal structure of the `LanguageModel` instance: [All are keyword arguments and optional] * `embedding_size` : defines size of embeddings for embedding matrix in `DroppedEmbeddings` layer (default value is 400) @@ -84,14 +84,14 @@ pretrain_lm!(lm::LanguageModel=LanguageModel(), Positional Arguments: - * `lm` : instance of `LanguageModel struct` + * `lm` : instance of `LanguageModel` struct * `data_loader` : this `Channel` is created to load the data from the general-domain corpus Keyword Arguments: - * `base_lr` : learning rate for `ADAM` optimizers + * `base_lr` : learning rate for `ADAM` optimizer * `epochs` : number of epochs - * `checkpoint_itvl` : Stands for Checkpoint interval, interval of number of iterations after which the model weights are saved to a specified BSON file + * `checkpoint_itvl` : Checkpoint interval, the number of iterations after which the model weights are saved to a specified BSON file [All default values shown above] @@ -99,11 +99,11 @@ To know the full implementation of the `LanguageModel`, `AWD_LSTM` layer and `Dr ## Step 2 - Fine-tuning Language Model -In this step, the Language Model pretrained in the last step, will be fine-tuned on the target data of the downstream task (e.g. sentiment analysis). Again preprocess the text data from the dataset and create a `Channel` using the `generator` function. `fine_tune_lm.jl` contains all the functions related to fine-tuning of the Language model. +In this step, the Language Model pretrained in the last step will be fine-tuned on the target data of the downstream task (e.g. sentiment analysis). 
Again, preprocess the text data from the dataset and create a `Channel` using the `generator` function. The file `fine_tune_lm.jl` contains all the functions related to fine-tuning of the Language model. -### Fine-tune Language model: +### Fine-tune Language Model -`fine_tune_lm!` function is used to fine-tune a Language Model: +The `fine_tune_lm!` function is used to fine-tune a Language Model: ```julia fine_tune_lm!(lm::LanguageModel=load_lm(), @@ -118,7 +118,7 @@ fine_tune_lm!(lm::LanguageModel=load_lm(), Positional Arguments: - * `lm` : Instance of `LanguageModel struct` + * `lm` : Instance of `LanguageModel` struct * `data_loader` : `Channel` created to load mini-batches from target data Keyword Arguments: @@ -126,21 +126,21 @@ Keyword Arguments: * `stlr_cut_frac` : In STLR, it is the fraction of iterations for which LR is increased * `stlr_ratio` : In STLR, it specifies how much smaller is lowest LR from maximum LR * `stlr_η_max` : In STLR, this is the maximum LR value - * `epochs` : It is simply the number of epochs for which the language model is to be fine-tuned - * `checkpoint_itvl` : Stands for Checkpoint interval, interval of number of iterations after which the model weights are saved to a specified BSON file + * `epochs` : the number of epochs for which the language model is to be fine-tuned + * `checkpoint_itvl` : Checkpoint interval, the number of iterations after which the model weights are saved to a specified BSON file [All default values shown above] By default the `fine_tune_lm!` function will load a pretrained model if a `LanguageModel` instance is not provided. -In fine-tuning step, some additional techniques are used to for training, namely, Discriminative fine-tuning and Slanted triangular learning rates (STLR). To know there implementation refer [this](https://nextjournal.com/ComputerMaestro/jsoc19-practical-implementation-of-ulmfit-in-julia-3) blog. +In the fine-tuning step, some additional techniques are used for training, namely, Discriminative fine-tuning and Slanted triangular learning rates (STLR). To know their implementation refer to [this](https://nextjournal.com/ComputerMaestro/jsoc19-practical-implementation-of-ulmfit-in-julia-3) blog. ## Step 3 - Fine-tuning the classifier for downstream task -This is the final step of training ULMFiT model for a specifc task. Here, two linear blocks will be in addition with the Language model layers. These are `PooledDense` and `Dense`. To know more about them go through [this](https://nextjournal.com/ComputerMaestro/jsoc19-practical-implementation-of-ulmfit-in-julia-5) blog post. +This is the final step of training the ULMFiT model for a specific task. Here, two linear blocks will be added to the Language model layers. These are `PooledDense` and `Dense`. To know more about them go through [this](https://nextjournal.com/ComputerMaestro/jsoc19-practical-implementation-of-ulmfit-in-julia-5) blog post. 
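Conceptually, the classifier head applies concat pooling over the encoder's hidden states (the last hidden state concatenated with max- and mean-pooled states) and feeds the result through two dense layers. A minimal sketch of that idea, written directly in Flux with hypothetical sizes (400-dimensional encoder outputs, 50 hidden units, 2 classes); this is not the package's actual `PooledDense`/`TextClassifier` implementation:

```julia
using Flux, Statistics

# Concat pooling: last hidden state, max-pool and mean-pool over the sequence.
concat_pool(H) = vcat(H[:, end], vec(maximum(H, dims=2)), vec(mean(H, dims=2)))

head = Chain(Dense(3 * 400, 50, relu),  # pooled vector -> hidden layer
             Dense(50, 2))              # hidden layer -> class scores

H = rand(Float32, 400, 25)              # stand-in for encoder outputs over 25 tokens
scores = softmax(head(concat_pool(H)))
```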
### Fine-tune text classifier -Before start of training, it is required to make an instance of the `TextClassifier` type like so: +Before starting training, it is required to make an instance of the `TextClassifier` type like so: ```julia julia> classifier = TextClassifier() @@ -154,7 +154,7 @@ Arguments: * `clsfr_hidden_sz` : hidden `PooledDense` layer size of classifier [default value is 50] * `clsfr_hidden_drop` : dropout probability for the `PooledDense` layer [hidden layer] of classifier [default value is 0.4] -To start training use `train_classifier!` function: +To start training, use the `train_classifier!` function: ```julia train_classifier!(classifier::TextClassifier=TextClassifier(), @@ -173,10 +173,10 @@ train_classifier!(classifier::TextClassifier=TextClassifier(), Positional Arguments: - * `lm` : Instance of `LanguageModel struct` + * `lm` : Instance of `LanguageModel` struct * `classes` : Size of output layer for classifier or number of classes for which the classifier is to be trained - * `data_loader` : `Channel` created to load mini-batches for classification - * `hidden_layer_size`: Size of the hidden linear layer added for making classifier + * `data_loader` : `Channel` created to load mini-batches for classification + * `hidden_layer_size`: Size of the hidden linear layer added for making the classifier Keyword Arguments: @@ -184,19 +184,19 @@ Keyword Arguments: * `stlr_ratio` : In STLR, it specifies how much smaller is lowest LR from maximum LR * `stlr_η_max` : In STLR, this is the maximum LR value * `val_loader` : `Channel` which will load the cross validation set as mini-batches same as `data_loader` - * `cross_val_batches`: number of cross validation batches for the accuracy and loss will be printed - * `epochs` : It is simply the number of epochs for which the language model is to be fine-tuned - * `checkpoint_itvl` : Stands for Checkpoint interval, interval of number of iterations after which the model weights are saved to a specified BSON file + * `cross_val_batches`: number of cross validation batches for which the accuracy and loss will be printed + * `epochs` : the number of epochs for which the language model is to be fine-tuned + * `checkpoint_itvl` : Checkpoint interval, the number of iterations after which the model weights are saved to a specified BSON file -[All defaults values are shown above] +[All default values are shown above] ## Layers -There are some custom layers added for this model to work properly. All of them are described below, go though all of them to have a better understanding of the model. +There are some custom layers added for this model to work properly. All of them are described below; go through all of them to have a better understanding of the model. ### Weight-Dropped LSTM (WeightDroppedLSTM) -This is basically a modification to the original LSTM layer. The layer uses [DropConnect](http://yann.lecun.com/exdb/publis/pdf/wan-icml-13.pdf) with [Variational-dropping](https://arxiv.org/abs/1506.02557) concepts. In which, the hidden-to-hidden weights and input-to-hidden weights can be dropped randomly for given probability. That means, the layer uses the same drop mask for all timesteps and to do this, the layer saves the masks. To change the mask `reset_masks!` function should be used. +This is basically a modification to the original LSTM layer. The layer uses [DropConnect](http://yann.lecun.com/exdb/publis/pdf/wan-icml-13.pdf) with [Variational-dropping](https://arxiv.org/abs/1506.02557) concepts. 
In this approach, the hidden-to-hidden weights and input-to-hidden weights can be dropped randomly for a given probability. That means the layer uses the same drop mask for all timesteps and to do this, the layer saves the masks. To change the mask, the `reset_masks!` function should be used. ```julia # maskWi and maskWh are drop masks for Wi and Wh weights @@ -208,13 +208,10 @@ julia> wd = ULMFiT.WeightDroppedLSTM(4, 5, 0.3); # Pass julia> x = rand(4); -julia> h = wd(x) -5×1 Matrix{Float64}: - 0.17602923394922002 - 0.08615001440875035 - 0.015924513976372016 - 0.10526862977034518 - -0.04417581280319146 +julia> state = wd(x) +(([0.6103202972778569; 0.6103202972778569; 0.6103202972778569; 0.6103202972778569; 0.6103202972778569;;], [0.7274433138892508; 0.7274433138892508; 0.7274433138892508; 0.7274433138892508; 0.7274433138892508;;]), [0.6103202972778569; 0.6103202972778569; 0.6103202972778569; 0.6103202972778569; 0.6103202972778569;;]) + +# The output is a tuple containing ((hidden_state, cell_state), output) # To reset_masks! julia> ULMFiT.reset_masks!(wd) @@ -239,63 +236,37 @@ julia> awd.T # Pass julia> x = rand(3); -julia> h = awd(x) -4×1 Matrix{Float64}: - 0.15229648590284084 - -0.05929450272853615 - -0.06110043118692251 - 0.15302430271141032 +julia> state = awd(x) +(([0.6103202972778569; 0.6103202972778569; 0.6103202972778569; 0.6103202972778569;;], [0.7274433138892508; 0.7274433138892508; 0.7274433138892508; 0.7274433138892508;;]), [0.6103202972778569; 0.6103202972778569; 0.6103202972778569; 0.6103202972778569;;]) # Resetting drop masks -julia> awd.layer.cell.maskWi -16×3 Matrix{Float32}: - 0.0 0.0 0.0 - 2.0 0.0 0.0 - 0.0 2.0 0.0 - 0.0 0.0 0.0 - 2.0 2.0 2.0 - 0.0 2.0 0.0 - 2.0 0.0 2.0 - 2.0 2.0 2.0 - 2.0 0.0 0.0 - 0.0 0.0 2.0 - 2.0 0.0 0.0 - 2.0 0.0 2.0 - 0.0 2.0 0.0 - 0.0 2.0 0.0 - 2.0 2.0 2.0 - 2.0 2.0 2.0 +julia> awd.layer.cell.maskWi[1:5, :] +3×3 Matrix{Float32}: + 1.21938 1.21039 1.73037 + 0.0 0.0 0.0 + 0.0 1.35731 1.04278 + 0.0 1.058 0.0 + 0.0 1.1709 1.12383 julia> ULMFiT.reset_masks!(awd) -julia> awd.layer.cell.maskWi -16×3 Matrix{Float32}: - 0.0 2.0 0.0 - 0.0 2.0 0.0 - 0.0 0.0 0.0 - 2.0 2.0 0.0 - 2.0 2.0 2.0 - 2.0 2.0 2.0 - 0.0 2.0 0.0 - 2.0 2.0 0.0 - 2.0 0.0 2.0 - 0.0 0.0 2.0 - 2.0 0.0 0.0 - 2.0 2.0 2.0 - 0.0 0.0 2.0 - 0.0 2.0 2.0 - 2.0 0.0 2.0 - 0.0 0.0 2.0 +julia> awd.layer.cell.maskWi[1:5, :] +3×3 Matrix{Float32}: + 1.22731 0.0 1.95435 + 0.0 1.28479 0.0 + 0.0 1.51710 1.41950 + 1.68891 0.0 0.0 + 0.0 0.0 0.0 ``` ### Variational-DropOut (VarDrop) -This layer applis Variational-DropOut, which is, using same dropout mask till it is not specified to change or till a pass is over. This dropout is useful for recurrent layers since these layers perform better if same mask is used for all time-steps (pass) instead of using different for every timestep. [Refer [this](https://arxiv.org/pdf/1506.02557.pdf) paper for more details]. This layer saves the masks after generation till it is not specified to change. To change the mask use `reset_masks!` function. +This layer applies Variational-DropOut, which is using the same dropout mask until it is specified to change or until a pass is over. This dropout is useful for recurrent layers since these layers perform better if the same mask is used for all time-steps (pass) instead of using different masks for every timestep. [Refer to [this](https://arxiv.org/pdf/1506.02557.pdf) paper for more details]. This layer saves the masks after generation until it is specified to change. To change the mask use the `reset_masks!` function. 
```julia julia> vd = ULMFiT.VarDrop(0.5) VarDrop{Float64}(0.5, Matrix{Float32}(undef, 0, 0), true, true) -# No mask generation will nothing is passed +# No mask generation when nothing is passed julia> vd.mask 0×0 Matrix{Float32} @@ -324,7 +295,7 @@ julia> vd.mask ### Dropped Embeddings (DroppedEmbeddings) -This layer is an embedding layer which can work in two ways either to give embeddings Vectors for the given indices of words in vocabulary or can be used to get probability distribution for all the words of vocabulary with softmax layer, which is also called as weight-tying. Here, it can be used to tie weights of the embedding layer and the last softmax layer. In addition to this, it also dropped embeddings for words randomly for given probability of dropping, in other words, it puts whole embedding vector of randomly selects to vector of zeros. Here, the mask used for the dropping posses variational property, that is, it cannot be changed till it is not specified to change or generate a new drop mask. `reset_masks!` should be used to reset the mask. +This layer is an embedding layer which can work in two ways: either to give embedding vectors for the given indices of words in vocabulary or can be used to get probability distribution for all the words of vocabulary with a softmax layer, which is also called weight-tying. Here, it can be used to tie weights of the embedding layer and the last softmax layer. In addition to this, it also drops embeddings for words randomly for a given probability of dropping; in other words, it puts the whole embedding vector of randomly selected words to a vector of zeros. Here, the mask used for the dropping possesses a variational property, that is, it cannot be changed until it is specified to change or generate a new drop mask. `reset_masks!` should be used to reset the mask. ```julia julia> fieldnames(ULMFiT.DroppedEmbeddings) @@ -336,26 +307,26 @@ julia> de = ULMFiT.DroppedEmbeddings(5, 2, 0.3); julia> x = [4,2,1]; julia> embeddings = de(x) 2×3 transpose(::Matrix{Float32}) with eltype Float32: - 0.363157 -0.0246867 -0.332342 - -0.553211 -0.594884 0.184288 + -0.0805887 0.0 0.0 + -0.147202 -0.0 -0.0 julia> de.mask 5-element Vector{Float32}: - 1.4285715 - 1.4285715 - 1.4285715 - 1.4285715 - 1.4285715 + 0.0 + 0.0 + 0.0 + 0.87639743 + 0.0 # reset mask julia> reset_masks!(de) julia> de.mask 5-element Vector{Float32}: - 1.4285715 - 1.4285715 - 1.4285715 + 0.0 + 0.6529015 0.0 0.0 + 0.5247689 ``` ### Concat-Pooled Dense layer diff --git a/docs/src/crf.md b/docs/src/crf.md index 8ce9aad..a7537bd 100644 --- a/docs/src/crf.md +++ b/docs/src/crf.md @@ -2,130 +2,128 @@ This package currently provides support for Linear Chain Conditional Random Fields. -Let us first load the dependencies- +Let us first load the dependencies: - using Flux - using Flux: onehot, LSTM, Dense, reset! - using TextModels: CRF, viterbi_decode, crf_loss - -Conditional Random Field layer is essentially like a softmax that operates on the top most layer. +```@example crf +using Flux +using Flux: onehot, LSTM, Dense, reset! +using TextModels: CRF, viterbi_decode, crf_loss +nothing # hide +``` -Let us suppose the following input sequence to the CRF with `NUM_LABELS = 2` +Conditional Random Field layer is essentially like a softmax layer that operates on the top-most layer. 
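In a linear-chain CRF, that softmax runs over entire label sequences: each candidate sequence is scored by summing transition scores and per-position emission scores, and the scores are normalized over all possible sequences. A rough sketch of the standard formulation (the notation below is illustrative, not taken from the package):

```math
p(y \mid x) = \frac{\exp\Big(\sum_{t} \big(W_{y_{t-1}, y_t} + x_t[y_t]\big)\Big)}{\sum_{y'} \exp\Big(\sum_{t} \big(W_{y'_{t-1}, y'_t} + x_t[y'_t]\big)\Big)}
```

The `crf_loss` used further below is the negative log of this probability, i.e. the log-partition term minus the score of the true label sequence.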
-```julia -julia> NUM_LABELS = 2 -julia> SEQUENCE_LENGTH = 2 # CRFs can handle variable length inputs sequences -julia> input_seq = [Float32.(rand(NUM_LABELS + 2)) for i in 1:SEQUENCE_LENGTH] # NUM_LABELS + 2, where two extra features correspond to the :START and :END label. -2-element Vector{Vector{Float32}}: - [0.5114323, 0.5355139, 0.4011792, 0.56359255] - [0.22925346, 0.21232551, 0.77616125, 0.41560093] +Let us suppose the following input sequence to the CRF with `NUM_LABELS = 2`: +```@example crf +using Random +Random.seed!(42) # For reproducible documentation +NUM_LABELS = 2 +SEQUENCE_LENGTH = 3 # CRFs can handle variable length inputs sequences +input_seq = [rand(NUM_LABELS + 2) for i in 1:SEQUENCE_LENGTH] # NUM_LABELS + 2, where two extra features correspond to the :START and :END label. ``` -We define our crf layer as - +We define our CRF layer as: CRF(NUM_LABELS::Integer) -```julia -julia> c = CRF(NUM_LABELS) # The API internally append the START and END tags to NUM_LABELS. -CRF with 4 distinct tags (including START and STOP tags). +```@example crf +c = CRF(NUM_LABELS) # The API internally append the START and END tags to NUM_LABELS. ``` -Now as for the initial variable in Viterbi Decode or Forward Algorithm, -we define our input as +Now for the initial variable in Viterbi Decode or Forward Algorithm, +we define our input as: -```julia -julia> init_α = fill(-10000, (c.n + 2, 1)) -julia> init_α[c.n + 1] = 0 +```@example crf +init_α = fill(-10000, (c.n + 2, 1)) +init_α[c.n + 1] = 0 +init_α ``` Optionally this could be shifted to GPU by `init_α = gpu(init_α)`, -considering the input sequence to be CuArray in this case. -To shift a CRF `c` to gpu, one can use `c = gpu(c)`. +considering the input sequence to be a CuArray in this case. +To shift a CRF `c` to GPU, one can use `c = gpu(c)`. -To find out the crf loss, we use the following function - +To find the CRF loss, we use the following function: crf_loss(c::CRF, input_seq, label_sequence, init_α) -``` -julia> label_seq1 = [onehot(1, 1:2), onehot(1, 1:2)] - -julia> label_seq2 = [onehot(1, 1:2), onehot(2, 1:2)] - -julia> label_seq3 = [onehot(2, 1:2), onehot(1, 1:2)] +```@example crf +using Flux: onehot +label_seq1 = [onehot(1, 1:2), onehot(1, 1:2), onehot(1, 1:2)] +label_seq2 = [onehot(1, 1:2), onehot(1, 1:2), onehot(2, 1:2)] +label_seq3 = [onehot(2, 1:2), onehot(1, 1:2), onehot(1, 1:2)] +label_seq4 = [onehot(2, 1:2), onehot(2, 1:2), onehot(2, 1:2)] -julia> label_seq4 = [onehot(2, 1:2), onehot(2, 1:2)] - -julia> crf_loss(c, input_seq, label_seq1, init_α) -1.33554f0 - -julia> crf_loss(c, input_seq, label_seq2, init_α) -1.2327178f0 +crf_loss(c, input_seq, label_seq1, init_α) +``` -julia> crf_loss(c, input_seq, label_seq3, init_α) -1.3454239f0 +```@example crf +crf_loss(c, input_seq, label_seq2, init_α) +``` -julia> crf_loss(c, input_seq, label_seq4, init_α) -1.6871009f0 +```@example crf +crf_loss(c, input_seq, label_seq3, init_α) +``` +```@example crf +crf_loss(c, input_seq, label_seq4, init_α) ``` -We can decode this using Viterbi Decode. +We can decode this using Viterbi Decode: viterbi_decode(c::CRF, input_seq, init_α) -```julia -julia> viterbi_decode(c, input_seq, init_α) # Gives the label_sequence with least loss -2-element Vector{Flux.OneHotArray{UInt32, 2, 0, 1, UInt32}}: - [1, 0] - [0, 1] - +```@example crf +viterbi_decode(c, input_seq, init_α) # Gives the label_sequence with least loss ``` -This algorithm decodes for the label sequence with lowest loss value in polynomial time. 
+This algorithm decodes the label sequence with the lowest loss value in polynomial time. -Currently the Viterbi Decode only support cpu arrays. -When working with GPU, use viterbi_decode as follows +Currently the Viterbi Decode only supports CPU arrays. +When working with GPU, use viterbi_decode as follows: viterbi_decode(cpu(c), cpu.(input_seq), cpu(init_α)) ### Working with Flux layers -CRFs smoothly work over Flux layers- - -```julia -julia> NUM_FEATURES = 20 - -julia> input_seq = [rand(NUM_FEATURES) for i in 1:SEQUENCE_LENGTH] -2-element Vector{Vector{Float32}}: - [0.948219, 0.719964, 0.352734, 0.0677656, 0.570564, 0.187673, 0.525125, 0.787807, 0.262452, 0.472472, 0.573259, 0.643369, 0.00592054, 0.945258, 0.951466, 0.323156, 0.679573, 0.663285, 0.218595, 0.152846] - [0.433295, 0.11998, 0.99615, 0.530107, 0.188887, 0.897213, 0.993726, 0.0799431, 0.953333, 0.941808, 0.982638, 0.0919345, 0.27504, 0.894169, 0.66818, 0.449537, 0.93063, 0.384957, 0.415114, 0.212203] - -julia> m1 = Dense(NUM_FEATURES, NUM_LABELS + 2) - -julia> loss1(input_seq, label_seq) = crf_loss(c, m1.(input_seq), label_seq, init_α) # loss for model m1 +CRFs work smoothly with Flux layers: -julia> loss1(input_seq, [onehot(1, 1:2), onehot(1, 1:2)]) -4.6620379898687485 +```@example crf +using Flux: Dense +NUM_FEATURES = 20 +# For working with Dense layers, we can use 1D vectors +input_seq_dense = [rand(NUM_FEATURES) for i in 1:SEQUENCE_LENGTH] ``` +```@example crf +m1 = Dense(NUM_FEATURES, NUM_LABELS + 2) +loss1(input_seq, label_seq) = crf_loss(c, m1.(input_seq), label_seq, init_α) # loss for model m1 +loss1(input_seq_dense, [onehot(1, 1:2), onehot(1, 1:2), onehot(1, 1:2)]) +``` -Here is an example of CRF with LSTM and Dense layer - - -```julia -julia> LSTM_SIZE = 10 - -julia> lstm = LSTM(NUM_FEATURES, LSTM_SIZE) -julia> dense_out = Dense(LSTM_SIZE, NUM_LABELS + 2) +Here is an example of CRF with recurrent neural network layers: -julia> m2(x) = dense_out.(lstm.(x)) +```@example crf +# For recurrent layers, we need 2D input matrices (features × sequence_position) +# Let's create properly formatted 2D data +input_2d = [Float32.(rand(2, 1)) for i in 1:SEQUENCE_LENGTH] # 2 features, 1 time step each +input_2d +``` -julia> loss2(input_seq, label_seq) = crf_loss(c, m2(input_seq), label_seq, init_α) # loss for model m2 +```@example crf +using Flux: RNN +# Create a simple RNN model that works with 2D input +rnn_model = RNN(2 => 5) # 2 input features → 5 hidden units +dense_layer = Dense(5, NUM_LABELS + 2) # 5 hidden → 4 output (NUM_LABELS + 2) -julia> loss2(input_seq, [onehot(1, 1:2), onehot(1, 1:2)]) -1.6501050910529504 +# Forward pass through RNN then Dense layer +rnn_outputs = rnn_model.(input_2d) +final_outputs = dense_layer.(rnn_outputs) -julia> reset!(lstm) +# Now we can use this with CRF +loss_rnn(input_2d, label_seq) = crf_loss(c, dense_layer.(rnn_model.(input_2d)), label_seq, init_α) +loss_rnn(input_2d, [onehot(1, 1:2), onehot(2, 1:2), onehot(1, 1:2)]) ``` diff --git a/docs/src/index.md b/docs/src/index.md index 8c36217..e4badd3 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -2,7 +2,7 @@ The TextModels package enhances the TextAnalysis package with end-user focussed, practical natural language models, typically based on neural networks (in this case, [Flux](https://fluxml.ai/)) -This package depends on the [TextAnalysis](https://github.com/JuliaText/TextAnalysis.jl) package, which contains basic algorithms to deal with textual documetns. 
+This package depends on the [TextAnalysis](https://github.com/JuliaText/TextAnalysis.jl) package, which contains basic algorithms to deal with textual documents. ## Installation diff --git a/docs/src/ner.md b/docs/src/ner.md index ec38bbd..240ad53 100644 --- a/docs/src/ner.md +++ b/docs/src/ner.md @@ -1,7 +1,7 @@ # Named Entity Recognition The API provided is a pretrained model for tagging Named Entities. -The current model support 4 types of Named Entities - +The current model supports 4 types of Named Entities: - `PER`: Person - `LOC`: Location @@ -9,8 +9,8 @@ The current model support 4 types of Named Entities - - `MISC`: Miscellaneous - `O`: Not a Named Entity -To use the API, we first load the model weights into an instance of tagger. -The function also accepts the path of model_weights and model_dicts (for character and word embeddings) +To use the API, we first load the model weights into an instance of the tagger. +The function also accepts the path of model_weights and model_dicts (for character and word embeddings): NERTagger() NERTagger(dicts_path, weights_path) @@ -19,12 +19,12 @@ The function also accepts the path of model_weights and model_dicts (for charact julia> ner = NERTagger() ``` !!! note - When you call `NERTagger()` for the first time, the package will request permission for download the `Model_dicts` and `Model_weights`. Upon downloading, these are store locally and managed by `DataDeps`. So, on subsequent uses the weights will not need to be downloaded again. + When you call `NERTagger()` for the first time, the package will request permission to download the `Model_dicts` and `Model_weights`. Upon downloading, these are stored locally and managed by `DataDeps`. So, on subsequent uses the weights will not need to be downloaded again. Once we create an instance, we can call it to tag a String (sentence), sequence of tokens, `AbstractDocument` or `Corpus`. (ner::NERTagger)(sentence::String) - (ner::NERTagger)(tokens::Array{String, 1}) + (ner::NERTagger)(tokens::Vector{String}) (ner::NERTagger)(sd::StringDocument) (ner::NERTagger)(fd::FileDocument) (ner::NERTagger)(td::TokenDocument) @@ -35,7 +35,7 @@ julia> sentence = "This package is maintained by John Doe." "This package is maintained by John Doe." julia> tags = ner(sentence) -8-element Array{String,1}: +8-element Vector{String}: "O" "O" "O" @@ -47,13 +47,13 @@ julia> tags = ner(sentence) ``` -The API tokenizes the input sentences via the default tokenizer provided by `WordTokenizers`, this currently being set to the multilingual `TokTok Tokenizer.` +The API tokenizes the input sentences via the default tokenizer provided by `WordTokenizers`, this currently being set to the multilingual `TokTok Tokenizer`. ``` julia> using WordTokenizers julia> collect(zip(WordTokenizers.tokenize(sentence), tags)) -8-element Array{Tuple{String,String},1}: +8-element Vector{Tuple{String,String}}: ("This", "O") ("package", "O") ("is", "O") @@ -65,59 +65,59 @@ julia> collect(zip(WordTokenizers.tokenize(sentence), tags)) ``` -For tagging a multisentence text or document, once can use `split_sentences` from `WordTokenizers.jl` package and run the ner model on each. +For tagging a multi-sentence text or document, one can use `split_sentences` from `WordTokenizers.jl` package and run the NER model on each. ```julia -julia> sentences = "Rabinov is winding up his term as ambassador. He will be replaced by Eliahu Ben-Elissar, a former Israeli envoy to Egypt and right-wing Likud party politiian." 
# Sentence taken from CoNLL 2003 Dataset +julia> sentences = "Rabinov is winding up his term as ambassador. He will be replaced by Eliahu Ben-Elissar, a former Israeli envoy to Egypt and right-wing Likud party politician." # Sentence taken from CoNLL 2003 Dataset julia> splitted_sents = WordTokenizers.split_sentences(sentences) julia> tag_sequences = ner.(splitted_sents) -2-element Array{Array{String,1},1}: - ["PER", "O", "O", "O", "O", "O", "O", "O", "O"] +2-element Vector{Vector{String}}: + ["O", "O", "O", "O", "O", "O", "O", "O", "O"] ["O", "O", "O", "O", "O", "PER", "PER", "O", "O", "O", "MISC", "O", "O", "LOC", "O", "O", "ORG", "ORG", "O", "O"] -julia> zipped = [collect(zip(tag_sequences[i], WordTokenizers.tokenize(splitted_sents[i]))) for i in eachindex(splitted_sents)] +julia> zipped = [collect(zip(WordTokenizers.tokenize(splitted_sents[i]), tag_sequences[i])) for i in eachindex(splitted_sents)] julia> zipped[1] -9-element Array{Tuple{String,String},1}: - ("PER", "Rabinov") - ("O", "is") - ("O", "winding") - ("O", "up") - ("O", "his") - ("O", "term") - ("O", "as") - ("O", "ambassador") - ("O", ".") +9-element Vector{Tuple{String,String}}: + ("Rabinov", "O") + ("is", "O") + ("winding", "O") + ("up", "O") + ("his", "O") + ("term", "O") + ("as", "O") + ("ambassador", "O") + (".", "O") julia> zipped[2] -20-element Array{Tuple{String,String},1}: - ("O", "He") - ("O", "will") - ("O", "be") - ("O", "replaced") - ("O", "by") - ("PER", "Eliahu") - ("PER", "Ben-Elissar") - ("O", ",") - ("O", "a") - ("O", "former") - ("MISC", "Israeli") - ("O", "envoy") - ("O", "to") - ("LOC", "Egypt") - ("O", "and") - ("O", "right-wing") - ("ORG", "Likud") - ("ORG", "party") - ("O", "politiian") - ("O", ".") +20-element Vector{Tuple{String,String}}: + ("He", "O") + ("will", "O") + ("be", "O") + ("replaced", "O") + ("by", "O") + ("Eliahu", "PER") + ("Ben-Elissar", "PER") + (",", "O") + ("a", "O") + ("former", "O") + ("Israeli", "MISC") + ("envoy", "O") + ("to", "O") + ("Egypt", "LOC") + ("and", "O") + ("right-wing", "O") + ("Likud", "ORG") + ("party", "ORG") + ("politician", "O") + (".", "O") ``` -Since the tagging the Named Entities is done on sentence level, -the text of `AbstractDocument` is sentence_tokenized and then labelled for over sentence. -However is not possible for `NGramDocument` as text cannot be recreated. +Since tagging the Named Entities is done on sentence level, +the text of `AbstractDocument` is sentence-tokenized and then labelled for each sentence. +However, this is not possible for `NGramDocument` as text cannot be recreated. For `TokenDocument`, text is approximated for splitting into sentences, hence the following throws a warning when tagging the `Corpus`. ```julia @@ -134,7 +134,7 @@ Corpus's index contains 0 tokens julia> ner(crps) ┌ Warning: TokenDocument's can only approximate the original text └ @ TextAnalysis ~/.julia/dev/TextAnalysis/src/document.jl:220 -2-element Array{Array{Array{String,1},1},1}: +2-element Vector{Vector{Vector{String}}}: [["O", "O", "O", "O", "O", "O", "O", "O"]] [["O", "O", "LOC", "O"]] ``` diff --git a/docs/src/sentiment.md b/docs/src/sentiment.md index 3c99075..20aec49 100644 --- a/docs/src/sentiment.md +++ b/docs/src/sentiment.md @@ -1,14 +1,14 @@ ## Sentiment Analyzer It can be used to find the sentiment score (between 0 and 1) of a word, sentence or a Document. -A trained model (using Flux) on IMDB word corpus with weights saved are used to calculate the sentiments. 
+A trained model (using Flux) trained on the IMDB word corpus with saved weights is used to calculate the sentiments. model = SentimentAnalyzer() model(doc) model(doc, handle_unknown) -* doc = Input Document for calculating document (AbstractDocument type) -* handle_unknown = A function for handling unknown words. Should return an array (default (x)->[]) +* `doc` - Input Document for sentiment analysis (AbstractDocument type) +* `handle_unknown` - A function for handling unknown words. Should return an array (default `(x)->[]`) ```julia julia> using TextAnalysis @@ -25,7 +25,7 @@ A StringDocument{String} * Snippet: a very nice thing that everyone likes julia> m(d1) -0.5183109f0 +0.5183109 julia> d2 = StringDocument("a horrible thing that everyone hates") A StringDocument{String} @@ -36,6 +36,6 @@ A StringDocument{String} * Snippet: a horrible thing that everyone hates julia> m(d2) -0.47193584f0 +0.47193587 ``` diff --git a/docs/src/tagging.md b/docs/src/tagging.md index 7f20d49..ebb263d 100644 --- a/docs/src/tagging.md +++ b/docs/src/tagging.md @@ -1,19 +1,20 @@ -## Tagging_schemes +## Tagging Schemes There are many tagging schemes used for sequence labelling. -TextAnalysis currently offers functions for conversion between these tagging format. +TextAnalysis currently offers functions for conversion between these tagging formats. * BIO1 * BIO2 * BIOES ```julia +julia> using TextAnalysis julia> tags = ["I-LOC", "O", "I-PER", "B-MISC", "I-MISC", "B-PER", "I-PER", "I-PER"] julia> tag_scheme!(tags, "BIO1", "BIOES") julia> tags -8-element Array{String,1}: +8-element Vector{String}: "S-LOC" "O" "S-PER" @@ -26,15 +27,15 @@ julia> tags ## Parts of Speech Tagging -This package provides with two different Part of Speech Tagger. +This package provides two different Part of Speech Taggers. ## Average Perceptron Part of Speech Tagger -This tagger can be used to find the POS tag of a word or token in a given sentence. It is a based on `Average Perceptron Algorithm`. -The model can be trained from scratch and weights are saved in specified location. -The pretrained model can also be loaded and can be used directly to predict tags. +This tagger can be used to find the POS tag of a word or token in a given sentence. It is based on the `Average Perceptron Algorithm`. +The model can be trained from scratch and weights are saved in a specified location. +The pretrained model can also be loaded and used directly to predict tags. 
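Under the hood the update rule is the classic perceptron one: whenever the predicted tag differs from the true tag, the weights of the active features are incremented for the true tag and decremented for the guessed tag, and the weights are averaged over all iterations at the end of training. A minimal sketch of that update step (the names below are illustrative, not the package's internal API):

```julia
# Sketch of a single perceptron update for POS tagging.
# `weights[feature][tag]` scores how strongly a feature votes for a tag.
function perceptron_update!(weights, features, truth, guess)
    truth == guess && return weights
    for f in features
        w = get!(weights, f, Dict{String,Float64}())
        w[truth] = get(w, truth, 0.0) + 1.0   # reward the correct tag
        w[guess] = get(w, guess, 0.0) - 1.0   # penalise the wrong guess
    end
    return weights
end

weights = Dict{String,Dict{String,Float64}}()
perceptron_update!(weights, ["i word today", "i-1 tag -START-"], "NN", "VBZ")
```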
-### To train model: +### To Train Model: ```julia julia> tagger = TextModels.PerceptronTagger(false) #we can use tagger = TextModels.PerceptronTagger() julia> fit!(tagger, [[("today","NN"),("is","VBZ"),("good","JJ"),("day","NN")]]) @@ -45,62 +46,62 @@ iteration : 4 iteration : 5 ``` -### To load pretrained model: +### To Load Pretrained Model: ```julia julia> tagger = TextModels.PerceptronTagger(true) loaded successfully PerceptronTagger(AveragePerceptron(Set(Any["JJS", "NNP_VBZ", "NN_NNS", "CC", "NNP_NNS", "EX", "NNP_TO", "VBD_DT", "LS", ("Council", "NNP") … "NNPS", "NNP_LS", "VB", "NNS_NN", "NNP_SYM", "VBZ", "VBZ_JJ", "UH", "SYM", "NNP_NN", "CD"]), Dict{Any,Any}("i+2 word wetlands"=>Dict{Any,Any}("NNS"=>0.0,"JJ"=>0.0,"NN"=>0.0),"i-1 tag+i word NNP basic"=>Dict{Any,Any}("JJ"=>0.0,"IN"=>0.0),"i-1 tag+i word DT chloride"=>Dict{Any,Any}("JJ"=>0.0,"NN"=>0.0),"i-1 tag+i word NN choo"=>Dict{Any,Any}("NNP"=>0.0,"NN"=>0.0),"i+1 word antarctica"=>Dict{Any,Any}("FW"=>0.0,"NN"=>0.0),"i-1 tag+i word -START- appendix"=>Dict{Any,Any}("NNP"=>0.0,"NNPS"=>0.0,"NN"=>0.0),"i-1 word wahoo"=>Dict{Any,Any}("JJ"=>0.0,"VBD"=>0.0),"i-1 tag+i word DT children's"=>Dict{Any,Any}("NNS"=>0.0,"NN"=>0.0),"i word dnipropetrovsk"=>Dict{Any,Any}("NNP"=>0.003,"NN"=>-0.003),"i suffix hla"=>Dict{Any,Any}("JJ"=>0.0,"NN"=>0.0)…), DefaultDict{Any,Any,Int64}(), DefaultDict{Any,Any,Int64}(), 1, ["-START-", "-START2-"]), Dict{Any,Any}("is"=>"VBZ","at"=>"IN","a"=>"DT","and"=>"CC","for"=>"IN","by"=>"IN","Retrieved"=>"VBN","was"=>"VBD","He"=>"PRP","in"=>"IN"…), Set(Any["JJS", "NNP_VBZ", "NN_NNS", "CC", "NNP_NNS", "EX", "NNP_TO", "VBD_DT", "LS", ("Council", "NNP") … "NNPS", "NNP_LS", "VB", "NNS_NN", "NNP_SYM", "VBZ", "VBZ_JJ", "UH", "SYM", "NNP_NN", "CD"]), ["-START-", "-START2-"], ["-END-", "-END2-"], Any[]) ``` -### To predict tags: +### To Predict Tags: -The perceptron tagger can predict tags over various document types- +The perceptron tagger can predict tags over various document types: predict(tagger, sentence::String) - predict(tagger, Tokens::Array{String, 1}) + predict(tagger, Tokens::Vector{String}) predict(tagger, sd::StringDocument) predict(tagger, fd::FileDocument) predict(tagger, td::TokenDocument) -This can also be done by - +This can also be done by: tagger(input) ```julia julia> predict(tagger, ["today", "is"]) -2-element Array{Any,1}: +2-element Vector{Any}: ("today", "NN") ("is", "VBZ") julia> tagger(["today", "is"]) -2-element Array{Any,1}: +2-element Vector{Any}: ("today", "NN") ("is", "VBZ") ``` `PerceptronTagger(load::Bool)` -* load = Boolean argument if `true` then pretrained model is loaded +* `load` - Boolean argument; if `true` then pretrained model is loaded `fit!(self::PerceptronTagger, sentences::Vector{Vector{Tuple{String, String}}}, save_loc::String, nr_iter::Integer)` -* self = `PerceptronTagger` object -* sentences = `Vector` of `Vector` of `Tuple` of pair of word or token and its POS tag [see above example] -* save_loc = location of file to save the trained weights -* nr_iter = Number of iterations to pass the `sentences` to train the model ( default 5) +* `self` - `PerceptronTagger` object +* `sentences` - `Vector` of `Vector` of `Tuple` of pairs of word or token and its POS tag [see above example] +* `save_loc` - location of file to save the trained weights +* `nr_iter` - Number of iterations to pass the `sentences` to train the model (default 5) `predict(self::PerceptronTagger, tokens)` -* self = PerceptronTagger -* tokens = `Vector` of words or tokens for which to predict tags +* `self` - 
PerceptronTagger +* `tokens` - `Vector` of words or tokens for which to predict tags -## Neural Model for Part of Speech tagging using LSTMs, CNN and CRF +## Neural Model for Part of Speech Tagging Using LSTMs, CNN and CRF The API provided is a pretrained model for tagging Part of Speech. -The current model tags all the POS Tagging is done based on [convention used in Penn Treebank](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html), with 36 different Part of Speech tags excludes punctuation. +The current model tags all POS based on the [convention used in Penn Treebank](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html), with 36 different Part of Speech tags excluding punctuation. -To use the API, we first load the model weights into an instance of tagger. -The function also accepts the path of model_weights and model_dicts (for character and word embeddings) +To use the API, we first load the model weights into an instance of the tagger. +The function also accepts the path of model_weights and model_dicts (for character and word embeddings): PoSTagger() PoSTagger(dicts_path, weights_path) @@ -111,12 +112,12 @@ julia> pos = PoSTagger() ``` !!! note - When you call `PoSTagger()` for the first time, the package will request permission for download the `Model_dicts` and `Model_weights`. Upon downloading, these are store locally and managed by `DataDeps`. So, on subsequent uses the weights will not need to be downloaded again. + When you call `PoSTagger()` for the first time, the package will request permission to download the `Model_dicts` and `Model_weights`. Upon downloading, these are stored locally and managed by `DataDeps`. So, on subsequent uses the weights will not need to be downloaded again. Once we create an instance, we can call it to tag a String (sentence), sequence of tokens, `AbstractDocument` or `Corpus`. (pos::PoSTagger)(sentence::String) - (pos::PoSTagger)(tokens::Array{String, 1}) + (pos::PoSTagger)(tokens::Vector{String}) (pos::PoSTagger)(sd::StringDocument) (pos::PoSTagger)(fd::FileDocument) (pos::PoSTagger)(td::TokenDocument) @@ -128,7 +129,7 @@ julia> sentence = "This package is maintained by John Doe." "This package is maintained by John Doe." julia> tags = pos(sentence) -8-element Array{String,1}: +8-element Vector{String}: "DT" "NN" "VBZ" @@ -140,14 +141,14 @@ julia> tags = pos(sentence) ``` -The API tokenizes the input sentences via the default tokenizer provided by `WordTokenizers`, this currently being set to the multilingual `TokTok Tokenizer.` +The API tokenizes the input sentences via the default tokenizer provided by `WordTokenizers`, this currently being set to the multilingual `TokTok Tokenizer`. ``` julia> using WordTokenizers julia> collect(zip(WordTokenizers.tokenize(sentence), tags)) -8-element Array{Tuple{String,String},1}: +8-element Vector{Tuple{String,String}}: ("This", "DT") ("package", "NN") ("is", "VBZ") @@ -159,60 +160,60 @@ julia> collect(zip(WordTokenizers.tokenize(sentence), tags)) ``` -For tagging a multisentence text or document, once can use `split_sentences` from `WordTokenizers.jl` package and run the pos model on each. +For tagging a multi-sentence text or document, one can use `split_sentences` from `WordTokenizers.jl` package and run the POS model on each. ```julia -julia> sentences = "Rabinov is winding up his term as ambassador. He will be replaced by Eliahu Ben-Elissar, a former Israeli envoy to Egypt and right-wing Likud party politiian." 
# Sentence taken from CoNLL 2003 Dataset +julia> sentences = "Rabinov is winding up his term as ambassador. He will be replaced by Eliahu Ben-Elissar, a former Israeli envoy to Egypt and right-wing Likud party politician." # Sentence taken from CoNLL 2003 Dataset julia> splitted_sents = WordTokenizers.split_sentences(sentences) julia> tag_sequences = pos.(splitted_sents) -2-element Array{Array{String,1},1}: - ["NNP", "VBZ", "VBG", "RP", "PRP\$", "NN", "IN", "NN", "."] +2-element Vector{Vector{String}}: + ["CD", "VBZ", "VBG", "RP", "PRP\$", "NN", "IN", "NN", "."] ["PRP", "MD", "VB", "VBN", "IN", "NNP", "NNP", ",", "DT", "JJ", "JJ", "NN", "TO", "NNP", "CC", "JJ", "NNP", "NNP", "NNP", "."] -julia> zipped = [collect(zip(tag_sequences[i], WordTokenizers.tokenize(splitted_sents[i]))) for i in eachindex(splitted_sents)] +julia> zipped = [collect(zip(WordTokenizers.tokenize(splitted_sents[i]), tag_sequences[i])) for i in eachindex(splitted_sents)] julia> zipped[1] -9-element Array{Tuple{String,String},1}: - ("NNP", "Rabinov") - ("VBZ", "is") - ("VBG", "winding") - ("RP", "up") - ("PRP\$", "his") - ("NN", "term") - ("IN", "as") - ("NN", "ambassador") +9-element Vector{Tuple{String,String}}: + ("Rabinov", "CD") + ("is", "VBZ") + ("winding", "VBG") + ("up", "RP") + ("his", "PRP\$") + ("term", "NN") + ("as", "IN") + ("ambassador", "NN") (".", ".") julia> zipped[2] -20-element Array{Tuple{String,String},1}: - ("PRP", "He") - ("MD", "will") - ("VB", "be") - ("VBN", "replaced") - ("IN", "by") - ("NNP", "Eliahu") - ("NNP", "Ben-Elissar") +20-element Vector{Tuple{String,String}}: + ("He", "PRP") + ("will", "MD") + ("be", "VB") + ("replaced", "VBN") + ("by", "IN") + ("Eliahu", "NNP") + ("Ben-Elissar", "NNP") (",", ",") - ("DT", "a") - ("JJ", "former") - ("JJ", "Israeli") - ("NN", "envoy") - ("TO", "to") - ("NNP", "Egypt") - ("CC", "and") - ("JJ", "right-wing") - ("NNP", "Likud") - ("NNP", "party") - ("NNP", "politiian") + ("a", "DT") + ("former", "JJ") + ("Israeli", "JJ") + ("envoy", "NN") + ("to", "TO") + ("Egypt", "NNP") + ("and", "CC") + ("right-wing", "JJ") + ("Likud", "NNP") + ("party", "NNP") + ("politician", "NNP") (".", ".") ``` -Since the tagging the Part of Speech is done on sentence level, -the text of `AbstractDocument` is sentence_tokenized and then labelled for over sentence. -However is not possible for `NGramDocument` as text cannot be recreated. +Since Part of Speech tagging is done on sentence level, +the text of `AbstractDocument` is sentence-tokenized and then labelled for each sentence. +However, this is not possible for `NGramDocument` as text cannot be recreated. For `TokenDocument`, text is approximated for splitting into sentences, hence the following throws a warning when tagging the `Corpus`. ```julia @@ -230,7 +231,7 @@ Corpus's index contains 0 tokens julia> pos(crps) ┌ Warning: TokenDocument's can only approximate the original text └ @ TextAnalysis ~/.julia/dev/TextAnalysis/src/document.jl:220 -2-element Array{Array{Array{String,1},1},1}: +2-element Vector{Vector{Vector{String}}}: [["PRP", "VBP", "RB", "JJ", "TO", "DT", "NN", "."]] [["DT", "VBZ", "NNP", "."]] diff --git a/src/CRF/loss.jl b/src/CRF/loss.jl index 77baf70..2be59c4 100644 --- a/src/CRF/loss.jl +++ b/src/CRF/loss.jl @@ -1,8 +1,19 @@ """ - forward_score(c::CRF, x::Array) + forward_score(c::CRF, x, init_α) -Compute the Normalization / partition function -or the Forward Algorithm score - `Z` +Compute the normalization/partition function (Forward Algorithm score) `Z`. 
+ +This function calculates the total score of all possible label sequences +for the given input sequence, which is used as the denominator in the +CRF probability computation. + +# Arguments +- `c::CRF`: The CRF model +- `x`: Input sequence (emission scores) +- `init_α`: Initial alpha values for the forward algorithm + +# Returns +- The log of the partition function `Z` """ function forward_score(c::CRF, x, init_α) forward_var = log_sum_exp(c.W .+ x[1]' .+ init_α) @@ -15,14 +26,20 @@ function forward_score(c::CRF, x, init_α) end """ - score_sequence(c::CRF, xs, label_seq) + score_sequence(c::CRF, x, label_seq) -Calculating the score of the desired `label_seq` against sequence `xs`. -Not exponentiated as required for negative log likelihood, -thereby preventing operation. +Calculate the score of the desired `label_seq` against sequence `x`. -`label_seq`<:Array/ CuArray -eltype(label_seq) = Flux.OneHotVector +This function computes the unnormalized score (not exponentiated) for a given +label sequence, which is required for the negative log likelihood calculation. + +# Arguments +- `c::CRF`: The CRF model +- `x`: Input sequence (emission scores) +- `label_seq`: Target label sequence where `eltype(label_seq) = Flux.OneHotVector` + +# Returns +- Score value for the given label sequence """ function score_sequence(c::CRF, x, label_seq) score = preds_first(c, label_seq[1]) + x[1][onecold(label_seq[1])] @@ -35,5 +52,28 @@ function score_sequence(c::CRF, x, label_seq) return score + preds_last(c, label_seq[end]) end -# REGULARIZATION TERM +""" + crf_loss(c::CRF, x, label_seq, init_α) + +Compute the negative log-likelihood loss for CRF training. + +This function calculates the CRF loss as the difference between the forward score +(log partition function) and the score of the true label sequence. This represents +the negative log-likelihood of the true sequence under the CRF model. + +# Arguments +- `c::CRF`: The CRF model +- `x`: Input sequence (emission scores) +- `label_seq`: True label sequence +- `init_α`: Initial alpha values for the forward algorithm + +# Returns +- The negative log-likelihood loss value + +# Formula +``` +loss = forward_score(c, x, init_α) - score_sequence(c, x, label_seq) + = log(Z) - score(true_sequence) +``` +""" crf_loss(c::CRF, x, label_seq, init_α) = forward_score(c, x, init_α) - score_sequence(c, x, label_seq) diff --git a/src/averagePerceptronTagger.jl b/src/averagePerceptronTagger.jl index 571c52c..66fc843 100644 --- a/src/averagePerceptronTagger.jl +++ b/src/averagePerceptronTagger.jl @@ -1,374 +1,378 @@ -using DataStructures -using Random -using BSON -using DataDeps - -export fit!, predict - -function pos_tagger_datadep_register() - register(DataDep("POS Perceptron Tagger Weights", - """ - The trained weights for the average Perceptron Tagger on Part of Speech Tagging task. - """, - "https://github.com/JuliaText/TextAnalysis.jl/releases/download/v0.6.0/pretrainedMod.bson.zip", - "52519cb3aea5d8f74368faedea831471e5df34567de4748d15decea7424743d3", - post_fetch_method = function(fn) - unpack(fn) - rm("__MACOSX", recursive=true) - file = readdir()[1] - mv(file, "POSWeights.bson") - end - )) -end - -""" -This file contains the Average Perceptron model and Perceptron Tagger -which was original implemented by Matthew Honnibal. 
- -The model learns by basic perceptron algorithm -but after all iterations weights are being averaged - -AVERAGE PERCEPTRON MODEL - -This struct contains the actual Average Perceptron Model -""" -mutable struct AveragePerceptron - classes :: Set - weights :: Dict - _totals :: DefaultDict - _tstamps :: DefaultDict - i :: Int64 - START :: Array - - function AveragePerceptron() - new(Set(), Dict(), DefaultDict(0), DefaultDict(0), 1, ["-START-", "-START2-"]) - end -end - -""" -Predicting the class using current weights by doing Dot-product of features -and weights and return the scores -""" -function predict(self::AveragePerceptron, features) - scores = DefaultDict(0.0) - for (feature, value) in features - if feature ∉ keys(self.weights) - continue - end - weights = self.weights[feature] - for (label, weight) in weights - scores[label] += value * weight - end - end - function custmax(scores) - a = [scores[class] for class in self.classes] - zipped = collect(zip(a, self.classes)) - currmax = zipped[1] - for i=2:(length(zipped)) - currmax = max(currmax, zipped[i]) - end - return currmax[2] - end - return custmax(scores) -end - -""" -Applying the perceptron learning algorithm -Increment the truth weights and decrementing the guess weights, -if the guess is wrong -""" -function update(self::AveragePerceptron, truth, guess, features) - function upd_feat(c, f, w, v) - param = (f, c) - n_iters_at_this_weight = self.i - self._tstamps[param] - self._totals[param] += n_iters_at_this_weight * w - self.weights[f][c] = w + v - self._tstamps[param] = self.i - end - - self.i += 1 - if truth === guess - return nothing - end - for (f, value) in features - if f in keys(self.weights) - weights = self.weights[f] - else - self.weights[f] = Dict() - weights = Dict() - end - upd_feat(truth, f, get(weights, truth, 0.0), 1.0) - upd_feat(guess, f, get(weights, guess, 0.0), -1.0) - end - return nothing -end - -""" -Averaging the weights over all time stamps -""" -function average_weights(self::AveragePerceptron) - function newRound(fl, in) - temp = fl*(10^in) - return (float(round(temp))/(10^in)) - end - for (feature , weights) in self.weights - new_feat_weights = Dict() - for (clas, weight) in weights - param = (feature, clas) - total = self._totals[param] - total += (self.i - self._tstamps[param])*weight - averaged = newRound(total/float(self.i-1), 3) - if averaged != nothing - new_feat_weights[clas] = averaged - end - self.weights[feature] = new_feat_weights - end - end - return nothing -end - -""" -# PERCEPTRON TAGGER - -This struct contains the POS tagger "PerceptronTagger" which uses model in "AveragePerceptron" -In this training can be done and weights can be saved -Or a pretrain weights can be used (which are trained on same features) -and train more or can be used to predict - -## To train: - -```julia -julia> tagger = PerceptronTagger(false) - -julia> fit!(tagger, [[("today","NN"),("is","VBZ"),("good","JJ"),("day","NN")]]) -``` - -## To load pretrain model: - -```julia -julia> tagger = PerceptronTagger(true) -``` - -## To predict tag: - -```julia -julia> predict(tagger, ["today", "is"]) -``` -""" -mutable struct PerceptronTagger - model :: AveragePerceptron - tagdict :: Dict - classes :: Set - START :: Array - END :: Array - _sentences - - PerceptronTagger() = new(AveragePerceptron(), Dict(), Set(), ["-START-", "-START2-"], ["-END-", "-END2-"], []) -end - -function PerceptronTagger(load::Bool) - self = PerceptronTagger() - - # If load is true then a pretrain model will be import from location - if load - 
location = joinpath(datadep"POS Perceptron Tagger Weights", "POSWeights.bson") - pretrained = BSON.load(location) - self.model.weights = pretrained[:weights] - self.tagdict = pretrained[:tagdict] - self.classes = self.model.classes = Set(pretrained[:classes]) - println("loaded successfully") - end - - return self -end - -""" -makes a dictionary for single-tag words -params : sentences - an array of tuples which contains word and correspinding tag -""" -function makeTagDict(self::PerceptronTagger, sentences) - counts = DefaultDict(()->DefaultDict(0)) - for sentence in sentences - append!(self._sentences, sentences) - for (word, tag) in sentence - counts[word][tag] += 1 - push!(self.classes, tag) - end - end - freq_thresh = 20 - ambiguity_thresh = 0.97 - for (word, tag_freqs) in counts - mode, tag = findmax(collect(values(tag_freqs))); tag = collect(keys(tag_freqs))[tag] - n = sum(values(tag_freqs)) - if (n >= freq_thresh) && ((mode/n) >= ambiguity_thresh) - self.tagdict[word] = tag - end - end -end - -""" -This function is used to normalize the given word -params : word - String -""" -function normalize(word) - word = string(word) - if occursin("-", word) && (word[1] != "-") - return "!HYPHEN" - elseif occursin(r"^[\d]{4}$", word) - return "!YEAR" - elseif occursin(r"^[\d]$", string(word[1])) - return "!DIGITS" - else - return lowercase(word) - end -end - -""" -Converting the token into a feature representation, implemented as Dict -If the features change, a new model should be trained - -# Arguments: - -- `i` - index of word(or token) in sentence -- `word` - token -- `context` - array of tokens with starting and ending specifiers -- `prev` == "-START-" prev2 == "-START2-" - Start specifiers -""" -function getFeatures(self::PerceptronTagger, i, word, context, prev, prev2) - function add(sep, name, args...) 
- str = name - for arg in args - str *= sep * arg - end - if str in keys(features) - features[str] += 1 - else - features[str] = 1 - end - return nothing - end - i += length(self.START) - features = OrderedDict() - add(" ", "bias") - if length(word) >= 3 - add(" ", "i suffix", word[end-2:end]) - else - add(" ", "i suffix", word) - end - add(" ", "i pref1", word[1]) - add(" ", "i-1 tag", prev) - add(" ", "i-2 tag", prev2) - add(" ", "i tag+i-2 tag", prev, prev2) - add(" ", "i word", context[i]) - add(" ", "i-1 tag+i word", prev, context[i]) - add(" ", "i-1 word", context[i-1]) - if length(context[i-1]) >= 3 - add(" ", "i-1 suffix", context[i-1][end-2:end]) - else - add(" ", "i-1 suffix", context[i-1]) - end - add(" ", "i-2 word", context[i-2]) - add(" ", "i+1 word", context[i+1]) - if length(context[i+1]) >= 3 - add(" ", "i+1 suffix", context[i+1][end-2:end]) - else - add(" ", "i+1 suffix", context[i+1]) - end - add(" ", "i+2 word", context[i+2]) - return features -end - -""" - predict(::PerceptronTagger, tokens) - predict(::PerceptronTagger, sentence) - -Used for predicting the tags for given sentence or array of tokens -""" -function predict(self::PerceptronTagger, tokens::Vector{String}) - prev, prev2 = self.START - output = [] - - context = vcat(self.START, [normalize(word) for word in tokens], self.END) - for (i, word) in enumerate(tokens) - tag = get(self.tagdict, word, nothing) - if tag === nothing - features = getFeatures(self, i, word, context, prev, prev2) - tag = predict(self.model, features) - end - push!(output, (word, tag)) - prev2 = prev - prev = tag - end - return output -end - -function (tagger::PerceptronTagger)(input) - predict(tagger, input) -end - -predict(tagger::PerceptronTagger, sentence::String) = - predict(tagger, WordTokenizers.tokenize(sentence)) -predict(tagger::PerceptronTagger, sd::StringDocument) = - predict(tagger, text(sd)) -predict(tagger::PerceptronTagger, fd::FileDocument) = - predict(tagger, text(fd)) -predict(tagger::PerceptronTagger, td::TokenDocument) = - predict(tagger, TextAnalysis.tokens(td)) -function predict(tagger::PerceptronTagger, ngd::NGramDocument) - @warn "POS tagging for NGramDocument not available." -end - - - -""" - fit!(::PerceptronTagger, sentences::Vector{Vector{Tuple{String, String}}}, save_loc::String, nr_iter::Integer) - -Used for training a new model or can be used for training -an existing model by using pretrained weigths and classes - -Contains main training loop for number of epochs. -After training weights, tagdict and classes are stored in the specified location. 
- -# Arguments: -- `::PerceptronTagger` : Input PerceptronTagger model -- `sentences::Vector{Vector{Tuple{String, String}}}` : Array of the all token seqeunces with target POS tag -- `save_loc::String` : To specify the saving location -- `nr_iter::Integer` : Total number of training iterations for given sentences(or number of epochs) -""" -function fit!(self::PerceptronTagger, sentences::Vector{Vector{Tuple{String, String}}}, save_loc::String, nr_iter::Integer) - self._sentences = [] - makeTagDict(self, sentences) - self.model.classes = self.classes - for iter=1:nr_iter - c = 0; n = 0; - for sentence in self._sentences - words, tags = [x[1] for x in sentence], [x[2] for x in sentence] - prev, prev2 = self.START - context = vcat(self.START, [normalize(w) for w in words], self.END) - for (i, word) in enumerate(words) - guess = get(self.tagdict, word, nothing) - if guess == nothing - feats = getFeatures(self, i, word, context, prev, prev2) - guess = predict(self.model, feats) - update(self.model, tags[i], guess, feats) - end - prev2 = prev - prev = guess - c += (guess == tags[i]) - n += 1 - end - end - shuffle(self._sentences) - println("iteration : $iter") - end - self._sentences = nothing - average_weights(self.model) - - if save_loc != "" - bson(save_loc, weights = self.model.weights, tagdict = self.tagdict, classes = collect(self.classes)) - end -end - -fit!(self::PerceptronTagger, sentences::Vector{Vector{Tuple{String, String}}}, nr_iter::Integer) = fit!(self::PerceptronTagger, sentences, "", nr_iter) -fit!(self::PerceptronTagger, sentences::Vector{Vector{Tuple{String, String}}}, save_loc::String) = fit!(self::PerceptronTagger, sentences, save_loc, 5) -fit!(self::PerceptronTagger, sentences::Vector{Vector{Tuple{String, String}}}) = fit!(self::PerceptronTagger, sentences, "", 5) +using DataStructures +using Random +using BSON +using DataDeps + +export fit!, predict + +function pos_tagger_datadep_register() + register(DataDep("POS Perceptron Tagger Weights", + """ + The trained weights for the average Perceptron Tagger on Part of Speech Tagging task. + """, + "https://github.com/JuliaText/TextAnalysis.jl/releases/download/v0.6.0/pretrainedMod.bson.zip", + "52519cb3aea5d8f74368faedea831471e5df34567de4748d15decea7424743d3", + post_fetch_method = function(fn) + unpack(fn) + rm("__MACOSX", recursive=true) + file = readdir()[1] + mv(file, "POSWeights.bson") + end + )) +end + +""" +This file contains the Average Perceptron model and Perceptron Tagger +which was original implemented by Matthew Honnibal. 
+ +The model learns by basic perceptron algorithm +but after all iterations weights are being averaged + +AVERAGE PERCEPTRON MODEL + +This struct contains the actual Average Perceptron Model +""" +mutable struct AveragePerceptron + classes :: Set + weights :: Dict + _totals :: DefaultDict + _tstamps :: DefaultDict + i :: Int64 + START :: Array + + function AveragePerceptron() + new(Set(), Dict(), DefaultDict(0), DefaultDict(0), 1, ["-START-", "-START2-"]) + end +end + +""" +Predicting the class using current weights by doing Dot-product of features +and weights and return the scores +""" +function predict(self::AveragePerceptron, features) + scores = DefaultDict(0.0) + for (feature, value) in features + if feature ∉ keys(self.weights) + continue + end + weights = self.weights[feature] + for (label, weight) in weights + scores[label] += value * weight + end + end + function custmax(scores) + a = [scores[class] for class in self.classes] + zipped = collect(zip(a, self.classes)) + currmax = zipped[1] + for i=2:(length(zipped)) + currmax = max(currmax, zipped[i]) + end + return currmax[2] + end + return custmax(scores) +end + +""" +Applying the perceptron learning algorithm +Increment the truth weights and decrementing the guess weights, +if the guess is wrong +""" +function update(self::AveragePerceptron, truth, guess, features) + function upd_feat(c, f, w, v) + param = (f, c) + n_iters_at_this_weight = self.i - self._tstamps[param] + self._totals[param] += n_iters_at_this_weight * w + self.weights[f][c] = w + v + self._tstamps[param] = self.i + end + + self.i += 1 + if truth === guess + return nothing + end + for (f, value) in features + if f in keys(self.weights) + weights = self.weights[f] + else + self.weights[f] = Dict() + weights = Dict() + end + upd_feat(truth, f, get(weights, truth, 0.0), 1.0) + upd_feat(guess, f, get(weights, guess, 0.0), -1.0) + end + return nothing +end + +""" +Averaging the weights over all time stamps +""" +function average_weights(self::AveragePerceptron) + function newRound(fl, in) + temp = fl*(10^in) + return (float(round(temp))/(10^in)) + end + for (feature , weights) in self.weights + new_feat_weights = Dict() + for (clas, weight) in weights + param = (feature, clas) + total = self._totals[param] + total += (self.i - self._tstamps[param])*weight + averaged = newRound(total/float(self.i-1), 3) + if averaged != nothing + new_feat_weights[clas] = averaged + end + self.weights[feature] = new_feat_weights + end + end + return nothing +end + +""" +# PERCEPTRON TAGGER + +This struct contains the POS tagger "PerceptronTagger" which uses model in "AveragePerceptron" +In this training can be done and weights can be saved +Or a pretrain weights can be used (which are trained on same features) +and train more or can be used to predict + +## To train: + +```julia +julia> using TextModels: PerceptronTagger, fit! 
+julia> tagger = PerceptronTagger(false) +julia> sentences = [[("today","NN"),("is","VBZ"),("good","JJ"),("day","NN")]] +julia> fit!(tagger, sentences, 2) # Train for 2 iterations +``` + +## To load pretrain model: + +```julia +julia> using TextModels: PerceptronTagger +julia> tagger = PerceptronTagger(true) +``` + +## To predict tag: + +```julia +julia> using TextModels: PerceptronTagger, predict +julia> tagger = PerceptronTagger(true) # Load pretrained model +julia> predict(tagger, ["today", "is"]) +``` +""" +mutable struct PerceptronTagger + model :: AveragePerceptron + tagdict :: Dict + classes :: Set + START :: Array + END :: Array + _sentences + + PerceptronTagger() = new(AveragePerceptron(), Dict(), Set(), ["-START-", "-START2-"], ["-END-", "-END2-"], []) +end + +function PerceptronTagger(load::Bool) + self = PerceptronTagger() + + # If load is true then a pretrain model will be import from location + if load + location = joinpath(datadep"POS Perceptron Tagger Weights", "POSWeights.bson") + pretrained = BSON.load(location) + self.model.weights = pretrained[:weights] + self.tagdict = pretrained[:tagdict] + self.classes = self.model.classes = Set(pretrained[:classes]) + println("loaded successfully") + end + + return self +end + +""" +makes a dictionary for single-tag words +params : sentences - an array of tuples which contains word and correspinding tag +""" +function makeTagDict(self::PerceptronTagger, sentences) + counts = DefaultDict(()->DefaultDict(0)) + for sentence in sentences + append!(self._sentences, sentences) + for (word, tag) in sentence + counts[word][tag] += 1 + push!(self.classes, tag) + end + end + freq_thresh = 20 + ambiguity_thresh = 0.97 + for (word, tag_freqs) in counts + mode, tag = findmax(collect(values(tag_freqs))); tag = collect(keys(tag_freqs))[tag] + n = sum(values(tag_freqs)) + if (n >= freq_thresh) && ((mode/n) >= ambiguity_thresh) + self.tagdict[word] = tag + end + end +end + +""" +This function is used to normalize the given word +params : word - String +""" +function normalize(word) + word = string(word) + if occursin("-", word) && (word[1] != "-") + return "!HYPHEN" + elseif occursin(r"^[\d]{4}$", word) + return "!YEAR" + elseif occursin(r"^[\d]$", string(word[1])) + return "!DIGITS" + else + return lowercase(word) + end +end + +""" +Converting the token into a feature representation, implemented as Dict +If the features change, a new model should be trained + +# Arguments: + +- `i` - index of word(or token) in sentence +- `word` - token +- `context` - array of tokens with starting and ending specifiers +- `prev` == "-START-" prev2 == "-START2-" - Start specifiers +""" +function getFeatures(self::PerceptronTagger, i, word, context, prev, prev2) + function add(sep, name, args...) 
+ str = name + for arg in args + str *= sep * arg + end + if str in keys(features) + features[str] += 1 + else + features[str] = 1 + end + return nothing + end + i += length(self.START) + features = OrderedDict() + add(" ", "bias") + if length(word) >= 3 + add(" ", "i suffix", word[end-2:end]) + else + add(" ", "i suffix", word) + end + add(" ", "i pref1", word[1]) + add(" ", "i-1 tag", prev) + add(" ", "i-2 tag", prev2) + add(" ", "i tag+i-2 tag", prev, prev2) + add(" ", "i word", context[i]) + add(" ", "i-1 tag+i word", prev, context[i]) + add(" ", "i-1 word", context[i-1]) + if length(context[i-1]) >= 3 + add(" ", "i-1 suffix", context[i-1][end-2:end]) + else + add(" ", "i-1 suffix", context[i-1]) + end + add(" ", "i-2 word", context[i-2]) + add(" ", "i+1 word", context[i+1]) + if length(context[i+1]) >= 3 + add(" ", "i+1 suffix", context[i+1][end-2:end]) + else + add(" ", "i+1 suffix", context[i+1]) + end + add(" ", "i+2 word", context[i+2]) + return features +end + +""" + predict(::PerceptronTagger, tokens) + predict(::PerceptronTagger, sentence) + +Used for predicting the tags for given sentence or array of tokens +""" +function predict(self::PerceptronTagger, tokens::Vector{String}) + prev, prev2 = self.START + output = [] + + context = vcat(self.START, [normalize(word) for word in tokens], self.END) + for (i, word) in enumerate(tokens) + tag = get(self.tagdict, word, nothing) + if tag === nothing + features = getFeatures(self, i, word, context, prev, prev2) + tag = predict(self.model, features) + end + push!(output, (word, tag)) + prev2 = prev + prev = tag + end + return output +end + +function (tagger::PerceptronTagger)(input) + predict(tagger, input) +end + +predict(tagger::PerceptronTagger, sentence::String) = + predict(tagger, WordTokenizers.tokenize(sentence)) +predict(tagger::PerceptronTagger, sd::StringDocument) = + predict(tagger, text(sd)) +predict(tagger::PerceptronTagger, fd::FileDocument) = + predict(tagger, text(fd)) +predict(tagger::PerceptronTagger, td::TokenDocument) = + predict(tagger, TextAnalysis.tokens(td)) +function predict(tagger::PerceptronTagger, ngd::NGramDocument) + @warn "POS tagging for NGramDocument not available." +end + + + +""" + fit!(::PerceptronTagger, sentences::Vector{Vector{Tuple{String, String}}}, save_loc::String, nr_iter::Integer) + +Used for training a new model or can be used for training +an existing model by using pretrained weigths and classes + +Contains main training loop for number of epochs. +After training weights, tagdict and classes are stored in the specified location. 
+ +# Arguments: +- `::PerceptronTagger` : Input PerceptronTagger model +- `sentences::Vector{Vector{Tuple{String, String}}}` : Array of the all token seqeunces with target POS tag +- `save_loc::String` : To specify the saving location +- `nr_iter::Integer` : Total number of training iterations for given sentences(or number of epochs) +""" +function fit!(self::PerceptronTagger, sentences::Vector{Vector{Tuple{String, String}}}, save_loc::String, nr_iter::Integer) + self._sentences = [] + makeTagDict(self, sentences) + self.model.classes = self.classes + for iter=1:nr_iter + c = 0; n = 0; + for sentence in self._sentences + words, tags = [x[1] for x in sentence], [x[2] for x in sentence] + prev, prev2 = self.START + context = vcat(self.START, [normalize(w) for w in words], self.END) + for (i, word) in enumerate(words) + guess = get(self.tagdict, word, nothing) + if guess == nothing + feats = getFeatures(self, i, word, context, prev, prev2) + guess = predict(self.model, feats) + update(self.model, tags[i], guess, feats) + end + prev2 = prev + prev = guess + c += (guess == tags[i]) + n += 1 + end + end + shuffle(self._sentences) + println("iteration : $iter") + end + self._sentences = nothing + average_weights(self.model) + + if save_loc != "" + bson(save_loc, weights = self.model.weights, tagdict = self.tagdict, classes = collect(self.classes)) + end +end + +fit!(self::PerceptronTagger, sentences::Vector{Vector{Tuple{String, String}}}, nr_iter::Integer) = fit!(self::PerceptronTagger, sentences, "", nr_iter) +fit!(self::PerceptronTagger, sentences::Vector{Vector{Tuple{String, String}}}, save_loc::String) = fit!(self::PerceptronTagger, sentences, save_loc, 5) +fit!(self::PerceptronTagger, sentences::Vector{Vector{Tuple{String, String}}}) = fit!(self::PerceptronTagger, sentences, "", 5) diff --git a/src/sequence/ner.jl b/src/sequence/ner.jl index ca9503c..c582033 100644 --- a/src/sequence/ner.jl +++ b/src/sequence/ner.jl @@ -29,6 +29,34 @@ function load_model_dicts(filepath, remove_tag_prefix) return remove_ner_label_prefix.([labels...]), chars_idx, words_idx end +""" + NERTagger() + NERTagger(dicts_path, weights_path) + +Creates a Named Entity Recognition tagger using a pretrained BiLSTM-CNN-CRF model. + +The model supports 4 types of Named Entities: +- `PER`: Person +- `LOC`: Location +- `ORG`: Organisation +- `MISC`: Miscellaneous +- `O`: Not a Named Entity + +# Arguments +- `dicts_path::String`: Path to model dictionaries (character and word embeddings) +- `weights_path::String`: Path to model weights + +# Usage +```julia +julia> ner = NERTagger() # Downloads pretrained model on first use +julia> ner("John works at Google in California") +``` + +!!! note + When calling `NERTagger()` for the first time, the package will download + the model dictionaries and weights. These are stored locally and managed + by DataDeps for subsequent uses. +""" NERTagger() = NERTagger(datadep"NER Model Dicts", datadep"NER Model Weights") function NERTagger(dicts_path, weights_path) diff --git a/src/sequence/pos.jl b/src/sequence/pos.jl index b23c210..ba50ad4 100644 --- a/src/sequence/pos.jl +++ b/src/sequence/pos.jl @@ -7,6 +7,39 @@ struct PoSModel{M} model::M end +""" + PoSTagger() + PoSTagger(dicts_path, weights_path) + +Creates a Part-of-Speech tagger using a pretrained BiLSTM-CNN-CRF model. + +The model performs POS tagging based on the Penn Treebank convention with 36 +different Part-of-Speech tags (excluding punctuation). 
+ +# Arguments +- `dicts_path::String`: Path to model dictionaries (character and word embeddings) +- `weights_path::String`: Path to model weights + +# Usage +```julia +julia> pos = PoSTagger() # Downloads pretrained model on first use +julia> pos("This package is maintained by John Doe.") +8-element Vector{String}: + "DT" # This + "NN" # package + "VBZ" # is + "VBN" # maintained + "IN" # by + "NNP" # John + "NNP" # Doe + "." # . +``` + +!!! note + When calling `PoSTagger()` for the first time, the package will download + the model dictionaries and weights. These are stored locally and managed + by DataDeps for subsequent uses. +""" PoSTagger() = PoSTagger(datadep"POS Model Dicts", datadep"POS Model Weights") function PoSTagger(dicts_path, weights_path) From 3b96460612d35574221e5b8f62fa891a06d0d5a5 Mon Sep 17 00:00:00 2001 From: Roman S Samarev Date: Sat, 18 Oct 2025 23:01:21 +0300 Subject: [PATCH 4/4] bump version --- Project.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Project.toml b/Project.toml index 14e7007..0a02b6b 100644 --- a/Project.toml +++ b/Project.toml @@ -2,7 +2,7 @@ name = "TextModels" uuid = "77b9cbda-2a23-51df-82a3-24144d1cd378" license = "MIT" desc = "Practical Neural Network based models for Natural Language Processing" -version = "0.2.0" +version = "0.2.1" [deps] BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0" @@ -32,7 +32,7 @@ DataDeps = "0.7" DataStructures = "0.18, 0.19, 0.20" Flux = "0.16, 0.17" Functors = "0.4, 0.5, 0.6" -JSON = "0.21, 0.22" +JSON = "0.21, 1" Languages = "0.4" NNlib = "0.7, 0.8, 0.9, 0.10" StatsBase = "0.33, 0.34, 0.35"
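
---

A minimal end-to-end usage sketch, assembled only from the docstrings touched in patches 2 and 3 above; it sits outside the diffs and is illustrative, not normative. The constructors (`PerceptronTagger`, `PoSTagger`, `NERTagger`) and `predict` methods are the ones documented in this series, and the pretrained weights are fetched through DataDeps on first use, so the tags noted in comments are examples rather than guaranteed output.

```julia
# Illustrative sketch based on the docstrings added/updated in this patch series.
# Pretrained weights and dictionaries are downloaded via DataDeps on first use.
using TextAnalysis
using TextModels
using TextModels: PerceptronTagger, predict

# Averaged-perceptron POS tagger (src/averagePerceptronTagger.jl)
tagger = PerceptronTagger(true)          # load the pretrained weights
predict(tagger, ["today", "is"])         # returns (word, tag) tuples

# Document inputs now dispatch through TextAnalysis.tokens (patch 2)
doc = TokenDocument("today is a good day")
predict(tagger, doc)

# BiLSTM-CNN-CRF taggers (src/sequence/pos.jl, src/sequence/ner.jl)
pos = PoSTagger()
pos("This package is maintained by John Doe.")   # Penn Treebank tags, e.g. "DT", "NN", ...

ner = NERTagger()
ner("John works at Google in California")        # PER / LOC / ORG / MISC / O labels
```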