From 767c7f65f36a8307f1c4bd001adac39011291229 Mon Sep 17 00:00:00 2001 From: Avik Sengupta Date: Sun, 8 Nov 2020 20:52:55 +0000 Subject: [PATCH 01/23] move ci to github actions --- .github/workflows/ci.yml | 61 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 .github/workflows/ci.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..b46410b --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,61 @@ +name: CI +on: + push: + branches: + - master + tags: '*' + pull_request: +jobs: + test: + name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} + runs-on: ${{ matrix.os }} + continue-on-error: ${{ matrix.version == 'nightly' }} + strategy: + matrix: + version: + - '1.3' + - '1' + - 'nightly' + os: + - ubuntu-latest + - macOS-latest + - windows-latest + arch: + - x86 + - x64 + exclude: + # Remove some configurations from the build matrix to reduce CI time. + # See https://github.com/marketplace/actions/setup-julia-environment + # MacOS not available on x86 + - {os: 'macOS-latest', arch: 'x86'} + # Don't test on all versions + - {os: 'macOS-latest', version: '1.3'} + - {os: 'macOS-latest', version: 'nightly'} + - {os: 'windows-latest', version: '1.3'} + - {os: 'windows-latest', version: 'nightly'} + - {os: 'windows-latest', arch: 'x86'} + steps: + - uses: actions/checkout@v1 + - uses: julia-actions/setup-julia@latest + with: + version: ${{ matrix.version }} + arch: ${{ matrix.arch }} + - uses: julia-actions/julia-buildpkg@latest + - uses: julia-actions/julia-runtest@latest + with: + coverage: false + docs: + name: Documentation + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v1 + - uses: julia-actions/setup-julia@latest + with: + version: '1.5' + - run: julia --project=docs -e ' + using Pkg; + Pkg.develop(PackageSpec(; path=pwd())); + Pkg.instantiate();' + - run: julia --project=docs docs/make.jl + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From ec410a3748a671d4a401857753d4c4350c5aa252 Mon Sep 17 00:00:00 2001 From: Avik Sengupta Date: Sun, 8 Nov 2020 20:59:27 +0000 Subject: [PATCH 02/23] set DATADEPS env --- .github/workflows/ci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b46410b..70ce92e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -42,6 +42,8 @@ jobs: arch: ${{ matrix.arch }} - uses: julia-actions/julia-buildpkg@latest - uses: julia-actions/julia-runtest@latest + env: + DATADEPS_ALWAYS_ACCEPT: true with: coverage: false docs: From 1c325730d66e4f95cb1abce52b9d6d0dc837134b Mon Sep 17 00:00:00 2001 From: Avik Sengupta Date: Sun, 8 Nov 2020 21:00:25 +0000 Subject: [PATCH 03/23] Goodbye Travis. Thanks for all the fish. 
--- .travis.yml | 30 ------------------------------ 1 file changed, 30 deletions(-) delete mode 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index bf028c9..0000000 --- a/.travis.yml +++ /dev/null @@ -1,30 +0,0 @@ -language: julia -os: - - linux - - osx - - windows -env: - - DATADEPS_ALWAYS_ACCEPT=true -julia: - - 1.3 - - 1 - - nightly -matrix: - allow_failures: - - julia: nightly - exclude: - - os: osx - julia: 1.3 - - os: windows - julia: 1.3 - - os: osx - julia: nightly - - os: windows - julia: nightly - fast_finish: true -branches: - only: - - master - - /release-.*/ -notifications: - email: false From e6a7833c8c9643e214fc688142d55e5518f04af1 Mon Sep 17 00:00:00 2001 From: Avik Sengupta Date: Sun, 8 Nov 2020 21:00:49 +0000 Subject: [PATCH 04/23] docs typos --- docs/make.jl | 2 +- docs/src/index.md | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/make.jl b/docs/make.jl index 5876f79..06b376f 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -2,7 +2,7 @@ using Documenter, TextModels makedocs( modules = [TextModels], - sitename = "TextAnalysis", + sitename = "TextModels", format = Documenter.HTML( ), pages = [ diff --git a/docs/src/index.md b/docs/src/index.md index 2168e45..8c36217 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -2,6 +2,8 @@ The TextModels package enhances the TextAnalysis package with end-user focussed, practical natural language models, typically based on neural networks (in this case, [Flux](https://fluxml.ai/)) +This package depends on the [TextAnalysis](https://github.com/JuliaText/TextAnalysis.jl) package, which contains basic algorithms to deal with textual documetns. + ## Installation The TextModels package can be installed using Julia's package manager: From f2da616e0d049cfc4d51ea61c3fb9a1b91136175 Mon Sep 17 00:00:00 2001 From: Avik Sengupta Date: Tue, 17 Nov 2020 20:49:28 +0000 Subject: [PATCH 05/23] reduce testing on x86 --- .github/workflows/ci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 70ce92e..67b437d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -34,6 +34,8 @@ jobs: - {os: 'windows-latest', version: '1.3'} - {os: 'windows-latest', version: 'nightly'} - {os: 'windows-latest', arch: 'x86'} + - {arch: 'x86', version: '1.3'} + - {arch: 'x86', version: 'nightly'} steps: - uses: actions/checkout@v1 - uses: julia-actions/setup-julia@latest From 42a0e06a8440c9c0f32e8ed11ed9709bf7692ddd Mon Sep 17 00:00:00 2001 From: Adarshkumar712 Date: Wed, 28 Apr 2021 22:18:55 -0700 Subject: [PATCH 06/23] Update crf, ner, pos --- Project.toml | 35 ++++++++++++--------------- src/CRF/crf.jl | 4 ++-- src/CRF/loss.jl | 4 ++-- src/CRF/predict.jl | 6 ++--- src/TextModels.jl | 42 ++++++++++++++++----------------- src/sequence/pos.jl | 2 +- src/sequence/sequence_models.jl | 32 ++++++++++++------------- test/crf.jl | 17 ++++++------- test/runtests.jl | 2 +- 9 files changed, 70 insertions(+), 74 deletions(-) diff --git a/Project.toml b/Project.toml index 4687488..7d6b695 100644 --- a/Project.toml +++ b/Project.toml @@ -2,40 +2,35 @@ name = "TextModels" uuid = "77b9cbda-2a23-51df-82a3-24144d1cd378" license = "MIT" desc = "Practical Neural Network based models for Natural Language Processing" -version = "0.1.0" +version = "0.1.1" [deps] BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0" DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe" DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" -DelimitedFiles = 
"8bb1440f-4735-579b-a4ab-409b98df4dab" Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" Languages = "8ef0a80b-9436-5d2c-a485-80b904378c43" -LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" -Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" -Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" -Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" -Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" TextAnalysis = "a2db99b7-8b79-58f8-94bf-bbc811eef33d" -Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c" WordTokenizers = "796a5d58-b03d-544a-977e-18100b691f6e" +Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [compat] -BSON = "0.2.5" -DataDeps = "0.7" -DataStructures = "0.17, 0.18" -Flux = "0.9" -JSON = "0.21" -Languages = "0.4" -NNlib = "0.6, 0.7" -StatsBase = "0.33" -TextAnalysis = "0.7" -Tracker = "0.2" -WordTokenizers = "0.5" -julia = "1.3" +BSON = "0.3.3" +DataDeps = "0.7.7" +DataStructures = "0.18.9" +Flux = "0.12.2" +JSON = "0.21.1" +Languages = "0.4.3" +NNlib = "0.7" +StatsBase = "0.33.6" +TextAnalysis = "0.7.3" +WordTokenizers = "0.5.6" +Zygote = "0.6.10" +julia = "1.6" [extras] Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/src/CRF/crf.jl b/src/CRF/crf.jl index 98ffc23..3145d89 100644 --- a/src/CRF/crf.jl +++ b/src/CRF/crf.jl @@ -22,10 +22,10 @@ function CRF(n::Integer) W[:, n + 1] .= -10000 W[n + 2, :] .= -10000 - return CRF(param(W), n) + return CRF(W, n) end -@treelike CRF +@functor CRF function Base.show(io::IO, c::CRF) print(io, "CRF with ", c.n + 2, " distinct tags (including START and STOP tags).") diff --git a/src/CRF/loss.jl b/src/CRF/loss.jl index 495816d..c1405fd 100644 --- a/src/CRF/loss.jl +++ b/src/CRF/loss.jl @@ -5,13 +5,13 @@ Compute the Normalization / partition function or the Forward Algorithm score - `Z` """ function forward_score(c::CRF, x, init_α) - forward_var = log_sum_exp((c.W .+ x[1]') .+ init_α) + forward_var = log_sum_exp((c.W .+ (x[1]') .+ init_α)) for i in 2:length(x) forward_var = log_sum_exp((c.W .+ x[i]') .+ forward_var') end - return log_sum_exp(c.W[:, c.n + 2] + forward_var')[1] + return log_sum_exp(c.W[:, c.n + 2] .+ forward_var')[1] end """ diff --git a/src/CRF/predict.jl b/src/CRF/predict.jl index 29e2c34..3225b70 100644 --- a/src/CRF/predict.jl +++ b/src/CRF/predict.jl @@ -35,14 +35,14 @@ Computes the forward pass for viterbi algorithm. 
function _decode(c::CRF, x, init_vit_vars) α_idx = zeros(Int, c.n + 2, length(x)) - forward_var, α_idx[:, 1] = forward_pass_unit(Tracker.data((c.W .+ x[1]') .+ init_vit_vars)) + forward_var, α_idx[:, 1] = forward_pass_unit((c.W .+ x[1]') .+ init_vit_vars) for i in 2:length(x) - forward_var, α_idx[:, i] = forward_pass_unit(Tracker.data((c.W .+ x[i]') .+ forward_var')) + forward_var, α_idx[:, i] = forward_pass_unit((c.W .+ x[i]') .+ forward_var') end labels = zeros(Int, length(x)) - labels[end] = argmax(forward_var + Tracker.data(c.W[:, c.n + 2])')[2] + labels[end] = argmax(forward_var + (c.W[:, c.n + 2])')[2] for i in reverse(2:length(x)) labels[i - 1] = α_idx[labels[i], i] diff --git a/src/TextModels.jl b/src/TextModels.jl index a82ec68..79d6223 100644 --- a/src/TextModels.jl +++ b/src/TextModels.jl @@ -7,8 +7,8 @@ module TextModels using Pkg.Artifacts - using Flux, Tracker - using Flux: identity, onehot, onecold, @treelike, onehotbatch + using Flux, Zygote + using Flux: identity, onehot, onecold, @functor, onehotbatch using TextAnalysis @@ -36,31 +36,31 @@ module TextModels include("sequence/pos_datadeps.jl") include("sequence/pos.jl") include("sequence/sequence_models.jl") - - + + # ULMFiT - module ULMFiT - using ..TextAnalysis - using DataDeps - using Flux - using Tracker - using BSON - include("ULMFiT/utils.jl") - include("ULMFiT/datadeps.jl") - include("ULMFiT/data_loaders.jl") - include("ULMFiT/custom_layers.jl") - include("ULMFiT/pretrain_lm.jl") - include("ULMFiT/fine_tune_lm.jl") - include("ULMFiT/train_text_classifier.jl") - end - export ULMFiT + #module ULMFiT + # using ..TextAnalysis + # using DataDeps + # using Flux + # using Tracker + # using BSON + # include("ULMFiT/utils.jl") + # include("ULMFiT/datadeps.jl") + # include("ULMFiT/data_loaders.jl") + # include("ULMFiT/custom_layers.jl") + # include("ULMFiT/pretrain_lm.jl") + # include("ULMFiT/fine_tune_lm.jl") + # include("ULMFiT/train_text_classifier.jl") + #end + #export ULMFiT function __init__() pos_tagger_datadep_register() ner_datadep_register() pos_datadep_register() - ULMFiT.ulmfit_datadep_register() - + #ULMFiT.ulmfit_datadep_register() + global sentiment_model = artifact"sentiment_model" end end diff --git a/src/sequence/pos.jl b/src/sequence/pos.jl index 9346a3a..b23c210 100644 --- a/src/sequence/pos.jl +++ b/src/sequence/pos.jl @@ -1,4 +1,4 @@ -using BSON, Tracker +using BSON const PoSCharUNK = '¿' const PoSWordUNK = "" diff --git a/src/sequence/sequence_models.jl b/src/sequence/sequence_models.jl index b19e6a0..8c8a6df 100644 --- a/src/sequence/sequence_models.jl +++ b/src/sequence/sequence_models.jl @@ -1,4 +1,4 @@ -using BSON, Tracker +using BSON mutable struct BiLSTM_CNN_CRF_Model{C, W, L, D, O, A} labels::Array{String, 1} # List of Labels chars_idx#::Dict{Char, Integer} # Dict that maps chars to indices in W_Char_Embed @@ -33,32 +33,32 @@ function BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, UNK_char_idx,UNK_Wor init_α[n + 1] = 0 # Word and Character Embeddings. 
- W_word_Embed = BSON.load(joinpath(weights_path, "W_word_cpu.bson"))[:W_word_cpu] - W_Char_Embed = BSON.load(joinpath(weights_path, "W_char_cpu.bson"))[:W_char_cpu] + W_word_Embed = BSON.load(joinpath(weights_path, "W_word_cpu.bson"))[:W_word_cpu][:, 1:end-1] # no padding char token here + W_Char_Embed = BSON.load(joinpath(weights_path, "W_char_cpu.bson"))[:W_char_cpu][:, 1:end-1] # no padding word token here # Forward_LSTM forward_wts = BSON.load(joinpath(weights_path, "forward_lstm.bson")) forward_lstm = Flux.Recur(Flux.LSTMCell(forward_wts[:lstm_2], # Wi forward_wts[:lstm_1], # Wh forward_wts[:lstm_3], # b - forward_wts[:lstm_4], # h - forward_wts[:lstm_5] # c + (reshape(forward_wts[:lstm_4], length(forward_wts[:lstm_4]), 1), # h + reshape(forward_wts[:lstm_5], length(forward_wts[:lstm_5]), 1)) # c ), - forward_wts[:lstm_init], - forward_wts[:lstm_state] - ) + (reshape(forward_wts[:lstm_state][1], length(forward_wts[:lstm_state][1]), 1), # h + reshape(forward_wts[:lstm_state][2], length(forward_wts[:lstm_state][2]), 1)) + ) # Backward_LSTM backward_wts = BSON.load(joinpath(weights_path, "backward_lstm.bson")) backward = Flux.Recur(Flux.LSTMCell(backward_wts[:lstm_2], # Wi backward_wts[:lstm_1], # Wh backward_wts[:lstm_3], # b - backward_wts[:lstm_4], # h - backward_wts[:lstm_5] # c - ), - backward_wts[:lstm_init], - backward_wts[:lstm_state] - ) + (reshape(backward_wts[:lstm_4], length(backward_wts[:lstm_4]), 1), # h + reshape(backward_wts[:lstm_5], length(backward_wts[:lstm_5]), 1)) # c + ), + (reshape(backward_wts[:lstm_state][1], length(backward_wts[:lstm_state][1]), 1), # h + reshape(backward_wts[:lstm_state][2], length(backward_wts[:lstm_state][2]), 1)) + ) # Dense d_weights_bias = BSON.load(joinpath(weights_path, "d_cpu.bson")) @@ -69,7 +69,7 @@ function BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, UNK_char_idx,UNK_Wor # Load CRF. 
crf_wt = BSON.load(joinpath(weights_path, "crf_cpu.bson"))[:crf_Weights] - c = TextModels.CRF(crf_wt, size(crf_wt)[1] - 2) + c = CRF(crf_wt, size(crf_wt)[1] - 2) # Load Conv conv_wt_bias = BSON.load(joinpath(weights_path, "conv_cpu.bson")) @@ -100,7 +100,7 @@ function (a::BiLSTM_CNN_CRF_Model)(x) oh_outs = viterbi_decode(a.c, m(x), a.init_α) Flux.reset!(a.backward) Flux.reset!(a.forward_lstm) - [a.labels[oh.ix] for oh in oh_outs] + [a.labels[oh.indices] for oh in oh_outs] end onehotinput(m::BiLSTM_CNN_CRF_Model, word) = (onehot(get(m.words_idx, lowercase(word), m.UNK_Word_idx), 1:length(m.words_idx)), diff --git a/test/crf.jl b/test/crf.jl index 34237d2..d88e32e 100644 --- a/test/crf.jl +++ b/test/crf.jl @@ -1,5 +1,6 @@ using Flux -using Flux: gradient, LSTM, Dense, reset!, onehot, RNN +using Flux: LSTM, Dense, reset!, onehot, RNN +using Zygote: gradient using TextModels: score_sequence, forward_score @testset "crf" begin @@ -118,7 +119,7 @@ using TextModels: score_sequence, forward_score function train() for d in data reset!(lstm) - grads = Tracker.gradient(() -> loss(d[1], d[2]), ps) + grads = gradient(() -> loss(d[1], d[2]), ps) Flux.Optimise.update!(opt, ps, grads) end end @@ -129,17 +130,17 @@ using TextModels: score_sequence, forward_score end to_sum = [find_loss(d) for d in data] l1 = sum(to_sum) - dense_param_1 = deepcopy(Tracker.data(d_out.W)) - lstm_param_1 = deepcopy(Tracker.data(lstm.cell.Wh)) - crf_param_1 = deepcopy(Tracker.data(c.W)) + dense_param_1 = deepcopy(d_out.W) + lstm_param_1 = deepcopy(lstm.cell.Wh) + crf_param_1 = deepcopy(c.W) for i in 1:10 train() end - dense_param_2 = deepcopy(Tracker.data(d_out.W)) - lstm_param_2 = deepcopy(Tracker.data(lstm.cell.Wh)) - crf_param_2 = deepcopy(Tracker.data(c.W)) + dense_param_2 = deepcopy(d_out.W)) + lstm_param_2 = deepcopy(lstm.cell.Wh) + crf_param_2 = deepcopy(c.W) l2 = sum([find_loss(d) for d in data]) @test l1 > l2 diff --git a/test/runtests.jl b/test/runtests.jl index 1bcac94..1221b31 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -8,5 +8,5 @@ include("crf.jl") include("ner.jl") include("pos.jl") include("averagePerceptronTagger.jl") -include("ulmfit.jl") +#include("ulmfit.jl") include("sentiment.jl") From 3b614e07682d8c9ec1b6916f15a3238e09a98cdc Mon Sep 17 00:00:00 2001 From: Adarshkumar712 Date: Fri, 30 Apr 2021 09:52:24 -0700 Subject: [PATCH 07/23] Update ULMFiT model fix errors in training Correction in code for Text Classifier Remove gpu erro --- Project.toml | 4 +++ src/TextModels.jl | 33 ++++++++--------- src/ULMFiT/custom_layers.jl | 55 +++++++++++++++++------------ src/ULMFiT/data_loaders.jl | 32 ++++++++--------- src/ULMFiT/fine_tune_lm.jl | 26 ++++++-------- src/ULMFiT/pretrain_lm.jl | 20 +++++------ src/ULMFiT/sentiment.jl | 4 +-- src/ULMFiT/train_text_classifier.jl | 48 +++++++++++++++++-------- src/ULMFiT/utils.jl | 4 +-- test/ulmfit.jl | 6 ++-- 10 files changed, 129 insertions(+), 103 deletions(-) diff --git a/Project.toml b/Project.toml index 7d6b695..df1e03f 100644 --- a/Project.toml +++ b/Project.toml @@ -6,12 +6,16 @@ version = "0.1.1" [deps] BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +CorpusLoaders = "214a0ac2-f95b-54f7-a80b-442ed9c2c9e8" DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe" DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" +DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab" Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" Languages = "8ef0a80b-9436-5d2c-a485-80b904378c43" NNlib = 
"872c559c-99b0-510c-b3b7-b6c96a88d5cd" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" TextAnalysis = "a2db99b7-8b79-58f8-94bf-bbc811eef33d" diff --git a/src/TextModels.jl b/src/TextModels.jl index 79d6223..f437cb1 100644 --- a/src/TextModels.jl +++ b/src/TextModels.jl @@ -39,27 +39,28 @@ module TextModels # ULMFiT - #module ULMFiT - # using ..TextAnalysis - # using DataDeps - # using Flux - # using Tracker - # using BSON - # include("ULMFiT/utils.jl") - # include("ULMFiT/datadeps.jl") - # include("ULMFiT/data_loaders.jl") - # include("ULMFiT/custom_layers.jl") - # include("ULMFiT/pretrain_lm.jl") - # include("ULMFiT/fine_tune_lm.jl") - # include("ULMFiT/train_text_classifier.jl") - #end - #export ULMFiT + module ULMFiT + using TextAnalysis + using DataDeps + using Flux + using Zygote + using BSON + using CorpusLoaders + include("ULMFiT/utils.jl") + include("ULMFiT/datadeps.jl") + include("ULMFiT/data_loaders.jl") + include("ULMFiT/custom_layers.jl") + include("ULMFiT/pretrain_lm.jl") + include("ULMFiT/fine_tune_lm.jl") + include("ULMFiT/train_text_classifier.jl") + end + export ULMFiT function __init__() pos_tagger_datadep_register() ner_datadep_register() pos_datadep_register() - #ULMFiT.ulmfit_datadep_register() + ULMFiT.ulmfit_datadep_register() global sentiment_model = artifact"sentiment_model" end diff --git a/src/ULMFiT/custom_layers.jl b/src/ULMFiT/custom_layers.jl index e402c7d..ad6e906 100644 --- a/src/ULMFiT/custom_layers.jl +++ b/src/ULMFiT/custom_layers.jl @@ -8,7 +8,7 @@ This file contains the custom layers defined for this model: PooledDense """ -import Flux: gate, _testmode!, _dropout_kernel +import Flux: gate, testmode!, _dropout_kernel reset_masks!(entity) = nothing reset_probability!(entity) = nothing @@ -44,12 +44,12 @@ Moreover this also follows the Vartional DropOut citeria, that is, the drop mask is remains same for a whole training pass. This is done by saving the masks in 'maskWi' and 'maskWh' fields """ -mutable struct WeightDroppedLSTMCell{A, V, M} +mutable struct WeightDroppedLSTMCell{A, V, S, M} Wi::A Wh::A b::V - h::V - c::V + h::S + c::S p::Float64 maskWi::M maskWh::M @@ -60,17 +60,17 @@ function WeightDroppedLSTMCell(in::Integer, out::Integer, p::Float64=0.0; init = Flux.glorot_uniform) @assert 0 ≤ p ≤ 1 cell = WeightDroppedLSTMCell( - param(init(out*4, in)), - param(init(out*4, out)), - param(init(out*4)), - param(zeros(Float32, out)), - param(zeros(Float32, out)), + init(out*4, in), + init(out*4, out), + init(out*4), + reshape(zeros(Float32, out),out, 1), + reshape(zeros(Float32, out), out, 1), p, drop_mask((out*4, in), p), drop_mask((out*4, out), p), true ) - cell.b.data[gate(out, 2)] .= 1 + cell.b[gate(out, 2)] .= 1 return cell end @@ -88,9 +88,12 @@ function (m::WeightDroppedLSTMCell)((h, c), x) return (h′, c), h′ end -Flux.@treelike WeightDroppedLSTMCell +Flux.@functor WeightDroppedLSTMCell -_testmode!(m::WeightDroppedLSTMCell, test) = (m.active = !test) +Flux.trainable(m::WeightDroppedLSTMCell) = (m.Wi, m.Wh, m.b, m.h, m.c) + +testmode!(m::WeightDroppedLSTMCell, mode=true) = + (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) """ WeightDroppedLSTM(in::Integer, out::Integer, p::Float64=0.0) @@ -106,7 +109,7 @@ julia> wd = WeightDroppedLSTM(4, 5, 0.3); function WeightDroppedLSTM(a...; kw...) cell = WeightDroppedLSTMCell(a...;kw...) 
hidden = (cell.h, cell.c) - return Flux.Recur(cell, hidden, hidden) + return Flux.Recur(cell, hidden) end """ @@ -155,7 +158,9 @@ end AWD_LSTM(in::Integer, out::Integer, p::Float64=0.0; kw...) = AWD_LSTM(WeightDroppedLSTM(in, out, p; kw...), -1, []) -Flux.@treelike AWD_LSTM +Flux.@functor AWD_LSTM + +Flux.trainable(m::AWD_LSTM) = (m.layer,) (m::AWD_LSTM)(in) = m.layer(in) @@ -184,12 +189,12 @@ function asgd_step!(iter::Integer, layer::AWD_LSTM) p = get_trainable_params([layer]) avg_fact = 1/max(iter - layer.T + 1, 1) if avg_fact != 1 - layer.accum = layer.accum .+ Tracker.data.(p) + layer.accum = layer.accum .+ p for (ps, accum) in zip(p, layer.accum) - Tracker.data(ps) .= avg_fact*accum + ps .= avg_fact*accum end else - layer.accum = deepcopy(Tracker.data.(p)) # Accumulator for ASGD + layer.accum = deepcopy(p) # Accumulator for ASGD end end return @@ -230,7 +235,8 @@ function (vd::VarDrop)(x) return (x .* vd.mask) end -_testmode!(vd::VarDrop, test) = (vd.active = !test) +testmode!(m::VarDrop, mode=true) = + (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) # method for reseting mask of VarDrop reset_masks!(vd::VarDrop) = (vd.reset = true) @@ -270,7 +276,7 @@ end function DroppedEmbeddings(in::Integer, embed_size::Integer, p::Float64=0.0; init = Flux.glorot_uniform) de = DroppedEmbeddings{AbstractArray, typeof(p)}( - param(init(in, embed_size)), + init(in, embed_size), p, drop_mask((in,), p), true @@ -283,9 +289,12 @@ function (de::DroppedEmbeddings)(x::AbstractArray, tying::Bool=false) return tying ? dropped * x : transpose(dropped[x, :]) end -Flux.@treelike DroppedEmbeddings +Flux.@functor DroppedEmbeddings + +Flux.trainable(m::DroppedEmbeddings) = (m.emb) -_testmode!(de::DroppedEmbeddings, test) = (de.active = !test) +testmode!(m::DroppedEmbeddings, mode=true) = + (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) function reset_masks!(de::DroppedEmbeddings) de.mask = drop_mask(de.mask, de.p) @@ -324,10 +333,10 @@ PooledDense(W, b) = PooledDense(W, b, identity) function PooledDense(hidden_sz::Integer, out::Integer, σ = identity; initW = Flux.glorot_uniform, initb = (dims...) 
-> zeros(Float32, dims...)) -return PooledDense(param(initW(out, hidden_sz*3)), param(initb(out)), σ) +return PooledDense(initW(out, hidden_sz*3), initb(out), σ) end -Flux.@treelike PooledDense +Flux.@functor PooledDense function (a::PooledDense)(x) W, b, σ = a.W, a.b, a.σ diff --git a/src/ULMFiT/data_loaders.jl b/src/ULMFiT/data_loaders.jl index f59e403..839b408 100644 --- a/src/ULMFiT/data_loaders.jl +++ b/src/ULMFiT/data_loaders.jl @@ -27,29 +27,29 @@ function imdb_preprocess(doc::AbstractDocument) length(word) == 1 && return [word] return split(word, symbol) end - text = text(doc) - remove_corrupt_utf8!(text) - remove_case!(text) - prepare!(text, strip_html_tags) - tokens = tokens(text) + text_ = doc + remove_corrupt_utf8!(text_) + remove_case!(text_) + prepare!(text_, strip_html_tags) + tokens_ = tokens(text_) for symbol in [',', '.', '-', '/', "'s"] - tokens = split_word.(tokens, symbol) + tokens_ = split_word.(tokens_, symbol) temp = [] - for token in tokens + for token_ in tokens_ try - append!(temp, put(token, symbol)) + append!(temp, put(token_, symbol)) catch - append!(temp, token) + append!(temp, token_) end end - tokens = temp + tokens_ = temp end - deleteat!(tokens, findall(x -> isequal(x, "")||isequal(x, " "), tokens)) - return tokens + deleteat!(tokens_, findall(x -> isequal(x, "")||isequal(x, " "), tokens_)) + return tokens_ end # Loads WikiText-103 corpus and output a Channel to give a mini-batch at each call -function load_wikitext_103(batchsize::Integer, bptt::Integer; type = "train") +function load_wikitext_103(batchsize::Integer=16, bptt::Integer=70; type = "train") corpuspath = joinpath(datadep"WikiText-103", "wiki.$(type).tokens") corpus = read(open(corpuspath, "r"), String) corpus = tokenize(corpus) @@ -58,13 +58,13 @@ end # IMDB Data loaders for Sentiment Analysis specifically # IMDB data loader for fine-tuning Language Model -function imdb_fine_tune_data(batchsize::Integer, bptt::Integer, num_examples::Integer=50000) +function imdb_fine_tune_data(batchsize::Integer=16, bptt::Integer=70, num_examples::Integer=50000) imdb_dataset = IMDB("train_unsup") dataset = [] - for path in imdb_dataset.filepaths #extract data from the files in directory and put into channel + for path in imdb_dataset.filepaths[1:num_examples] #extract data from the files in directory and put into channel open(path) do fileio cur_text = read(fileio, String) - append!(dataset, imdb_preprocess(cur_text)) + append!(dataset, imdb_preprocess(StringDocument(cur_text))) end #open end #for return Channel(x -> generator(x, dataset; batchsize=batchsize, bptt=bptt)) diff --git a/src/ULMFiT/fine_tune_lm.jl b/src/ULMFiT/fine_tune_lm.jl index 17f33b9..b2e7261 100644 --- a/src/ULMFiT/fine_tune_lm.jl +++ b/src/ULMFiT/fine_tune_lm.jl @@ -24,17 +24,17 @@ opts : `Vector` of optimizers used to update weights for corresponding la NOTE: length(opts) == length(layers) """ -function discriminative_step!(layers, ηL::Float64, l, opts::Vector) +function discriminative_step!(layers, lm::LanguageModel, gen, ηL::Float64, opts::Vector) @assert length(opts) == length(layers) # Gradient calculation - grads = Tracker.gradient(() -> l, get_trainable_params(layers)) + grads = Zygote.gradient(() -> loss(lm, gen), get_trainable_params(layers)) # discriminative step ηl = ηL/(2.6^(length(layers)-1)) for (layer, opt) in zip(layers, opts) opt.eta = ηl for ps in get_trainable_params([layer]) - Tracker.update!(opt, ps, grads[ps]) + Flux.Optimise.update!(opt, ps, grads[ps]) end ηl *= 2.6 end @@ -50,32 +50,28 @@ This function contains main 
training loops for fine-tuning the language model. To use this funciton, an instance of LanguageModel and a data loader is needed. Read the docs for more info about arguments """ -function fine_tune_lm!(lm::LanguageModel, data_loader::Channel=imdb_fine_tune_data, - stlr_cut_frac::Float64=0.1, stlr_ratio::Float32=32, stlr_η_max::Float64=4e-3; +function fine_tune_lm!(lm=LanguageModel(), data_loader=imdb_fine_tune_data, + stlr_cut_frac::Float64=0.1, stlr_ratio::Float32=Float32(32), stlr_η_max::Float64=4e-3; epochs::Integer=1, checkpoint_itvl::Integer=5000) opts = [ADAM(0.001, (0.7, 0.99)) for i=1:4] - cut = num_of_iters * epochs * stlr_cut_frac - + # Fine-Tuning loops for epoch=1:epochs println("\nEpoch: $epoch") - gen = data_loader() - num_of_iters = take!(gen) + gen = data_loader() + num_of_iters = take!(gen) + cut = num_of_iters * epochs * stlr_cut_frac T = num_of_iters-Int(floor((num_of_iters*2)/100)) set_trigger!.(T, lm.layers) for i=1:num_of_iters - - # FORWARD - l = loss(lm, gen) - # Slanted triangular learning rate step t = i + (epoch-1)*num_of_iters p_frac = (i < cut) ? i/cut : (1 - ((i-cut)/(cut*(1/stlr_cut_frac-1)))) ηL = stlr_η_max*((1+p_frac*(stlr_ratio-1))/stlr_ratio) # Backprop with discriminative fine-tuning step - discriminative_step!(lm.layers[[1, 3, 5, 7]], ηL, l, opts) + discriminative_step!(lm.layers[[1, 3, 5, 7]], lm, gen, ηL, opts) # Resets dropout masks for all the layers with DropOut or DropConnect reset_masks!.(lm.layers) @@ -121,7 +117,7 @@ julia> insert!(vocab, 2, "_pad_") function set_vocab!(lm::LanguageModel, vocab::Vector) idxs = indices(vocab, lm.vocab) lm.vocab = vocab - lm.layers[1].emb = param(Tracker.data(lm.layers[1].emb)[idxs, :]) + lm.layers[1].emb = param(lm.layers[1].emb[idxs, :]) lm.layers[1].mask = gpu(drop_mask((length(vocab),), lm.layers[1].p)) return end diff --git a/src/ULMFiT/pretrain_lm.jl b/src/ULMFiT/pretrain_lm.jl index 74bc573..1afd48d 100644 --- a/src/ULMFiT/pretrain_lm.jl +++ b/src/ULMFiT/pretrain_lm.jl @@ -49,7 +49,7 @@ function LanguageModel(load_pretrained::Bool=false, vocabpath::String=joinpath(@ return lm end -Flux.@treelike LanguageModel +Flux.@functor LanguageModel """ test_lm(lm::LanguageModel, data_gen, num_of_iters::Integer; unknown_token::String="_unk_") @@ -63,7 +63,7 @@ It returns loss, accuracy, precsion, recall and F1 score. julia> test_lm(lm, data_gen, 200, " indices(x, lm.vocab, "_unk_"), batch) + batch = gpu(batch) batch = lm.layers.(batch) return batch end @@ -107,11 +108,11 @@ function loss(lm, gen) end # Backpropagation step while training -function backward!(layers, l, opt) +function backward!(layers, lm, gen, opt) # Calulating gradients and weights updation p = get_trainable_params(layers) - grads = Tracker.gradient(() -> l, p) - Tracker.update!(opt, p, grads) + grads = Zygote.gradient(() -> loss(lm, gen), p) + Flux.Optimise.update!(opt, p, grads) return end @@ -138,11 +139,8 @@ function pretrain_lm!(lm::LanguageModel=LanguageModel(), data_loader::Channel=lo set_trigger!.(T, lm.layers) # Setting triggers for AWD_LSTM layers for i=1:num_of_batches - # FORWARD PASS - l = loss(lm, gen) - # REVERSE PASS - backward!(lm.layers, l, opt) + backward!(lm.layers, lm, gen, opt) # ASGD Step, works after Triggering asgd_step!.(i, lm.layers) @@ -158,7 +156,7 @@ end # To save model function save_model!(m::LanguageModel, filepath::String) - weights = cpu.(Tracker.data.(params(m))) + weights = cpu.(params(m)) BSON.@save filepath weights end @@ -182,7 +180,7 @@ SAMPLING... 
""" function sample(starting_text::AbstractDocument, lm::LanguageModel) testmode!(lm.layers) - model_layers = mapleaves(Tracker.data, lm.layers) + model_layers = lm.layers tokens = tokens(starting_text) word_indices = map(x -> indices([x], lm.vocab, "_unk_"), tokens) h = (model_layers.(word_indices))[end] diff --git a/src/ULMFiT/sentiment.jl b/src/ULMFiT/sentiment.jl index c70069d..3ab5479 100644 --- a/src/ULMFiT/sentiment.jl +++ b/src/ULMFiT/sentiment.jl @@ -48,12 +48,12 @@ function BinSentimentClassifier() ) ) Flux.loadparams!(sc, weights) - sc = mapleaves(Tracker.data, sc) + sc = sc Flux.testmode!(sc) return sc end -Flux.@treelike BinSentimentClassifier +Flux.@functor BinSentimentClassifier function (sc::BinSentimentClassifier)(x::TokenDocument) remove_case!(x) diff --git a/src/ULMFiT/train_text_classifier.jl b/src/ULMFiT/train_text_classifier.jl index e30912f..702bd21 100644 --- a/src/ULMFiT/train_text_classifier.jl +++ b/src/ULMFiT/train_text_classifier.jl @@ -30,7 +30,7 @@ function TextClassifier(lm::LanguageModel=LanguageModel(), clsfr_out_sz::Integer ) end -Flux.@treelike TextClassifier +Flux.@functor TextClassifier """ Cross Validate @@ -48,7 +48,7 @@ gen will be used for validation """ function validate(tc::TextClassifier, gen::Channel, num_of_batches::Union{Colon, Integer}) n_classes = size(tc.linear_layers[end-2].W, 1) - classifier = mapleaves(Tracker.data, tc) + classifier = tc Flux.testmode!(classifier) loss = 0 iters = take!(gen) @@ -91,15 +91,17 @@ tracked_steps : This is the number of tracked time-steps for Truncated Backpro """ function forward(tc::TextClassifier, gen::Channel, tracked_steps::Integer=32) # swiching off tracking - classifier = mapleaves(Tracker.data, tc) + classifier = tc X = take!(gen) l = length(X) # Truncated Backprop through time - for i=1:ceil(l/now_per_pass)-1 # Tracking is swiched off inside this loop - (i == 1 && l%now_per_pass != 0) ? (last_idx = l%now_per_pass) : (last_idx = now_per_pass) - H = broadcast(x -> indices(x, classifier.vocab, "_unk_"), X[1:last_idx]) - H = classifier.rnn_layers.(H) - X = X[last_idx+1:end] + Zygote.ignore() do + for i=1:ceil(l/tracked_steps)-1 # Tracking is swiched off inside this loop + (i == 1 && l%tracked_steps != 0) ? 
(last_idx = l%tracked_steps) : (last_idx = tracked_steps) + H = broadcast(x -> indices(x, classifier.vocab, "_unk_"), X[1:last_idx]) + H = classifier.rnn_layers.(H) + X = X[last_idx+1:end] + end end # set the lated hidden states to original model for (t_layer, unt_layer) in zip(tc.rnn_layers[2:end], classifier.rnn_layers[2:end]) @@ -130,7 +132,7 @@ Arguments: classifier : Instance of TextClassifier gen : 'Channel' [data loader], to give a mini-batch -tracked_words : specifies the number of time-steps for which tracking is on +tracked_steps : specifies the number of time-steps for which tracking is on """ function loss(classifier::TextClassifier, gen::Channel, tracked_steps::Integer=32) H = forward(classifier, gen, tracked_steps) @@ -140,6 +142,23 @@ function loss(classifier::TextClassifier, gen::Channel, tracked_steps::Integer=3 return l end +function discriminative_step!(layers, classifier::TextClassifier, gen::Channel, tracked_steps::Integer, ηL::Float64, opts::Vector) + @assert length(opts) == length(layers) + # Gradient calculation + grads = Zygote.gradient(() -> loss(classifier, gen, tracked_steps = tracked_steps), get_trainable_params(layers)) + + # discriminative step + ηl = ηL/(2.6^(length(layers)-1)) + for (layer, opt) in zip(layers, opts) + opt.eta = ηl + for ps in get_trainable_params([layer]) + Flux.Optimise.update!(opt, ps, grads[ps]) + end + ηl *= 2.6 + end + return +end + """ train_classifier!(classifier::TextClassifier=TextClassifier(), classes::Integer=1, data_loader::Channel=imdb_classifier_data, hidden_layer_size::Integer=50;kw...) @@ -151,7 +170,7 @@ function train_classifier!(classifier::TextClassifier=TextClassifier(), classes: data_loader::Channel=imdb_classifier_data, hidden_layer_size::Integer=50; stlr_cut_frac::Float64=0.1, stlr_ratio::Number=32, stlr_η_max::Float64=0.01, val_loader::Channel=nothing, cross_val_batches::Union{Colon, Integer}=:, - epochs::Integer=1, checkpoint_itvl=5000) + epochs::Integer=1, checkpoint_itvl=5000, tracked_steps::Integer=32) trainable = [] append!(trainable, [classifier.rnn_layers[[1, 3, 5, 7]]...]) @@ -166,7 +185,6 @@ function train_classifier!(classifier::TextClassifier=TextClassifier(), classes: num_of_iters = take!(gen) cut = num_of_iters * epochs * stlr_cut_frac for iter=1:num_of_iters - l = loss(classifier, gen, now_per_pass = now_per_pass) # Slanted triangular learning rates t = iter + (epoch-1)*num_of_iters @@ -175,7 +193,7 @@ function train_classifier!(classifier::TextClassifier=TextClassifier(), classes: # Gradual-unfreezing Step with discriminative fine-tuning unfreezed_layers, cur_opts = (epoch < length(trainable)) ? 
(trainable[end-epoch+1:end], opts[end-epoch+1:end]) : (trainable, opts) - discriminative_step!(unfreezed_layers, ηL, l, cur_opts) + discriminative_step!(unfreezed_layers, classifier, gen, tracked_steps,ηL, cur_opts) reset_masks!.(classifier.rnn_layers) # reset all dropout masks end @@ -203,13 +221,13 @@ All the preprocessing related to the used vocabulary should be done before using Use `prepare!` function to do preprocessing """ function predict(tc::TextClassifier, text_sents::Corpus) - classifier = mapleaves(Tracker.data, tc) + classifier = tc Flux.testmode!(classifier) predictions = [] expr(x) = indices(x, classifier.vocab, "_unk_") for text in text_sents - tokens = tokens(text) - h = classifier.rnn_layers.(expr.(tokens)) + tokens_ = tokens(text) + h = classifier.rnn_layers.(expr.(tokens_)) probability_dist = classifier.linear_layers(h) class = argmax(probaility_dist) push!(predictions, class) diff --git a/src/ULMFiT/utils.jl b/src/ULMFiT/utils.jl index 691354f..64bfd11 100644 --- a/src/ULMFiT/utils.jl +++ b/src/ULMFiT/utils.jl @@ -27,8 +27,8 @@ end init_weights(extreme::AbstractFloat, dims...) = randn(Float32, dims...) .* sqrt(Float32(extreme)) # Generator, whenever it should be called two times since it gives X in first and y in second call -function generator(c::Channel, corpus::AbstractDocument; batchsize::Integer=64, bptt::Integer=70) - X_total = post_pad_sequences(chunk(tokens(corpus), batchsize)) +function generator(c::Channel, corpus; batchsize::Integer=64, bptt::Integer=70) + X_total = post_pad_sequences(Flux.chunk(corpus, batchsize)) n_batches = Int(floor(length(X_total[1])/bptt)) put!(c, n_batches) for i=1:n_batches diff --git a/test/ulmfit.jl b/test/ulmfit.jl index 8ea0092..b3820f9 100644 --- a/test/ulmfit.jl +++ b/test/ulmfit.jl @@ -4,7 +4,7 @@ using BSON @testset "Custom layers" begin @testset "WeightDroppedLSTM" begin wd = ULMFiT.WeightDroppedLSTM(4, 5, 0.3) - @test all(wd.init .== wd.state) + @test all((wd.cell.h, wd.cell.c) .== wd.state) @test size(wd.cell.Wi) == size(wd.cell.maskWi) @test size(wd.cell.Wh) == size(wd.cell.maskWh) @test wd.cell.active @@ -31,10 +31,10 @@ using BSON ULMFiT.asgd_step!(4, awd) @test length(awd.accum) == 3 temp = deepcopy(awd.accum[1][1]) - @test temp == Tracker.data(awd.layer.cell.Wi[1]) + @test temp == awd.layer.cell.Wi[1] ULMFiT.asgd_step!(5, awd) temp += temp - @test temp == Tracker.data(awd.accum[1][1]) + @test temp == awd.accum[1][1] @test length(params(awd)) == 5 end From 95550b9d6af45e451d0b916eb9d1d38abc1190e8 Mon Sep 17 00:00:00 2001 From: Adarshkumar712 Date: Sat, 1 May 2021 04:45:15 -0700 Subject: [PATCH 08/23] change truncate! to reset! 
--- src/TextModels.jl | 1 + src/ULMFiT/custom_layers.jl | 13 ++++++------- src/ULMFiT/pretrain_lm.jl | 4 ++-- src/ULMFiT/train_text_classifier.jl | 2 +- test/runtests.jl | 10 +++++----- test/ulmfit.jl | 2 +- 6 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/TextModels.jl b/src/TextModels.jl index f437cb1..9f8b5ca 100644 --- a/src/TextModels.jl +++ b/src/TextModels.jl @@ -43,6 +43,7 @@ module TextModels using TextAnalysis using DataDeps using Flux + using Flux:crossentropy using Zygote using BSON using CorpusLoaders diff --git a/src/ULMFiT/custom_layers.jl b/src/ULMFiT/custom_layers.jl index ad6e906..a07fd4a 100644 --- a/src/ULMFiT/custom_layers.jl +++ b/src/ULMFiT/custom_layers.jl @@ -48,8 +48,7 @@ mutable struct WeightDroppedLSTMCell{A, V, S, M} Wi::A Wh::A b::V - h::S - c::S + state0::S p::Float64 maskWi::M maskWh::M @@ -63,8 +62,8 @@ function WeightDroppedLSTMCell(in::Integer, out::Integer, p::Float64=0.0; init(out*4, in), init(out*4, out), init(out*4), - reshape(zeros(Float32, out),out, 1), - reshape(zeros(Float32, out), out, 1), + (reshape(zeros(Float32, out),out, 1), + reshape(zeros(Float32, out), out, 1)), p, drop_mask((out*4, in), p), drop_mask((out*4, out), p), @@ -90,7 +89,7 @@ end Flux.@functor WeightDroppedLSTMCell -Flux.trainable(m::WeightDroppedLSTMCell) = (m.Wi, m.Wh, m.b, m.h, m.c) +Flux.trainable(m::WeightDroppedLSTMCell) = (m.Wi, m.Wh, m.b, m.state0...) testmode!(m::WeightDroppedLSTMCell, mode=true) = (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) @@ -108,7 +107,7 @@ julia> wd = WeightDroppedLSTM(4, 5, 0.3); """ function WeightDroppedLSTM(a...; kw...) cell = WeightDroppedLSTMCell(a...;kw...) - hidden = (cell.h, cell.c) + hidden = cell.state0 return Flux.Recur(cell, hidden) end @@ -291,7 +290,7 @@ end Flux.@functor DroppedEmbeddings -Flux.trainable(m::DroppedEmbeddings) = (m.emb) +Flux.trainable(m::DroppedEmbeddings) = (m.emb,) testmode!(m::DroppedEmbeddings, mode=true) = (m.active = (isnothing(mode) || mode == :auto) ? 
nothing : !mode; m) diff --git a/src/ULMFiT/pretrain_lm.jl b/src/ULMFiT/pretrain_lm.jl index 1afd48d..6f5ca3a 100644 --- a/src/ULMFiT/pretrain_lm.jl +++ b/src/ULMFiT/pretrain_lm.jl @@ -102,8 +102,8 @@ end function loss(lm, gen) H = forward(lm, take!(gen)) Y = broadcast(x -> gpu(Flux.onehotbatch(x, lm.vocab, "_unk_")), take!(gen)) - l = sum(crossentropy.(H, Y)) - Flux.truncate!(lm.layers) + l = sum(Flux.crossentropy.(H, Y)) + Flux.reset!(lm.layers) return l end diff --git a/src/ULMFiT/train_text_classifier.jl b/src/ULMFiT/train_text_classifier.jl index 702bd21..2530f67 100644 --- a/src/ULMFiT/train_text_classifier.jl +++ b/src/ULMFiT/train_text_classifier.jl @@ -20,7 +20,7 @@ function TextClassifier(lm::LanguageModel=LanguageModel(), clsfr_out_sz::Integer lm.vocab, lm.layers[1:8], Chain( - gpu(PooledDense(length(lm.layers[7].layer.cell.h), clsfr_hidden_sz)), + gpu(PooledDense(length(lm.layers[7].layer.cell.state0[1]), clsfr_hidden_sz)), gpu(BatchNorm(clsfr_hidden_sz, relu)), Dropout(clsfr_hidden_drop), gpu(Dense(clsfr_hidden_sz, clsfr_out_sz)), diff --git a/test/runtests.jl b/test/runtests.jl index 1221b31..dae92ff 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -5,8 +5,8 @@ using TextModels println("Running tests:") include("crf.jl") -include("ner.jl") -include("pos.jl") -include("averagePerceptronTagger.jl") -#include("ulmfit.jl") -include("sentiment.jl") +#include("ner.jl") +#include("pos.jl") +#include("averagePerceptronTagger.jl") +include("ulmfit.jl") +#include("sentiment.jl") diff --git a/test/ulmfit.jl b/test/ulmfit.jl index b3820f9..cdd8fdd 100644 --- a/test/ulmfit.jl +++ b/test/ulmfit.jl @@ -4,7 +4,7 @@ using BSON @testset "Custom layers" begin @testset "WeightDroppedLSTM" begin wd = ULMFiT.WeightDroppedLSTM(4, 5, 0.3) - @test all((wd.cell.h, wd.cell.c) .== wd.state) + @test all((wd.cell.state0) .== wd.state) @test size(wd.cell.Wi) == size(wd.cell.maskWi) @test size(wd.cell.Wh) == size(wd.cell.maskWh) @test wd.cell.active From 0628c2df320bf39b380c880b307f8e2c23cde288 Mon Sep 17 00:00:00 2001 From: Adarshkumar712 Date: Sat, 1 May 2021 07:51:28 -0700 Subject: [PATCH 09/23] Updated and verified tests --- src/ULMFiT/custom_layers.jl | 21 ++++++++++++++++----- src/ULMFiT/pretrain_lm.jl | 2 +- src/ULMFiT/train_text_classifier.jl | 2 +- test/crf.jl | 6 +++--- test/runtests.jl | 7 +++---- test/ulmfit.jl | 2 +- 6 files changed, 25 insertions(+), 15 deletions(-) diff --git a/src/ULMFiT/custom_layers.jl b/src/ULMFiT/custom_layers.jl index a07fd4a..c0275a6 100644 --- a/src/ULMFiT/custom_layers.jl +++ b/src/ULMFiT/custom_layers.jl @@ -48,7 +48,8 @@ mutable struct WeightDroppedLSTMCell{A, V, S, M} Wi::A Wh::A b::V - state0::S + h::S + c::S p::Float64 maskWi::M maskWh::M @@ -62,8 +63,8 @@ function WeightDroppedLSTMCell(in::Integer, out::Integer, p::Float64=0.0; init(out*4, in), init(out*4, out), init(out*4), - (reshape(zeros(Float32, out),out, 1), - reshape(zeros(Float32, out), out, 1)), + reshape(zeros(Float32, out),out, 1), + reshape(zeros(Float32, out), out, 1), p, drop_mask((out*4, in), p), drop_mask((out*4, out), p), @@ -89,7 +90,7 @@ end Flux.@functor WeightDroppedLSTMCell -Flux.trainable(m::WeightDroppedLSTMCell) = (m.Wi, m.Wh, m.b, m.state0...) +Flux.trainable(m::WeightDroppedLSTMCell) = (m.Wi, m.Wh, m.b, m.h, m.c) testmode!(m::WeightDroppedLSTMCell, mode=true) = (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) @@ -107,10 +108,20 @@ julia> wd = WeightDroppedLSTM(4, 5, 0.3); """ function WeightDroppedLSTM(a...; kw...) cell = WeightDroppedLSTMCell(a...;kw...) 
- hidden = cell.state0 + hidden = (cell.h, cell.c) return Flux.Recur(cell, hidden) end +# over definition for reset! to work with pretrained model +function reset!(m) + try + (m.state = (m.cell.h, m.cell.c)) + catch + Flux.reset!(m) + end +end + + """ reset_masks!(layer) diff --git a/src/ULMFiT/pretrain_lm.jl b/src/ULMFiT/pretrain_lm.jl index 6f5ca3a..1a59112 100644 --- a/src/ULMFiT/pretrain_lm.jl +++ b/src/ULMFiT/pretrain_lm.jl @@ -103,7 +103,7 @@ function loss(lm, gen) H = forward(lm, take!(gen)) Y = broadcast(x -> gpu(Flux.onehotbatch(x, lm.vocab, "_unk_")), take!(gen)) l = sum(Flux.crossentropy.(H, Y)) - Flux.reset!(lm.layers) + reset!(lm.layers) return l end diff --git a/src/ULMFiT/train_text_classifier.jl b/src/ULMFiT/train_text_classifier.jl index 2530f67..702bd21 100644 --- a/src/ULMFiT/train_text_classifier.jl +++ b/src/ULMFiT/train_text_classifier.jl @@ -20,7 +20,7 @@ function TextClassifier(lm::LanguageModel=LanguageModel(), clsfr_out_sz::Integer lm.vocab, lm.layers[1:8], Chain( - gpu(PooledDense(length(lm.layers[7].layer.cell.state0[1]), clsfr_hidden_sz)), + gpu(PooledDense(length(lm.layers[7].layer.cell.h), clsfr_hidden_sz)), gpu(BatchNorm(clsfr_hidden_sz, relu)), Dropout(clsfr_hidden_drop), gpu(Dense(clsfr_hidden_sz, clsfr_out_sz)), diff --git a/test/crf.jl b/test/crf.jl index d88e32e..3f9246a 100644 --- a/test/crf.jl +++ b/test/crf.jl @@ -118,14 +118,14 @@ using TextModels: score_sequence, forward_score function train() for d in data - reset!(lstm) + Flux.reset!(lstm) grads = gradient(() -> loss(d[1], d[2]), ps) Flux.Optimise.update!(opt, ps, grads) end end function find_loss(d) - reset!(lstm) + Flux.reset!(lstm) loss(d[1], d[2]) end to_sum = [find_loss(d) for d in data] @@ -138,7 +138,7 @@ using TextModels: score_sequence, forward_score train() end - dense_param_2 = deepcopy(d_out.W)) + dense_param_2 = deepcopy(d_out.W) lstm_param_2 = deepcopy(lstm.cell.Wh) crf_param_2 = deepcopy(c.W) l2 = sum([find_loss(d) for d in data]) diff --git a/test/runtests.jl b/test/runtests.jl index dae92ff..f28f076 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -5,8 +5,7 @@ using TextModels println("Running tests:") include("crf.jl") -#include("ner.jl") -#include("pos.jl") -#include("averagePerceptronTagger.jl") +include("ner.jl") +include("pos.jl") +include("averagePerceptronTagger.jl") include("ulmfit.jl") -#include("sentiment.jl") diff --git a/test/ulmfit.jl b/test/ulmfit.jl index cdd8fdd..b3820f9 100644 --- a/test/ulmfit.jl +++ b/test/ulmfit.jl @@ -4,7 +4,7 @@ using BSON @testset "Custom layers" begin @testset "WeightDroppedLSTM" begin wd = ULMFiT.WeightDroppedLSTM(4, 5, 0.3) - @test all((wd.cell.state0) .== wd.state) + @test all((wd.cell.h, wd.cell.c) .== wd.state) @test size(wd.cell.Wi) == size(wd.cell.maskWi) @test size(wd.cell.Wh) == size(wd.cell.maskWh) @test wd.cell.active From 94edf5f90b0a111380409c284c7cac2e120b59ea Mon Sep 17 00:00:00 2001 From: Adarshkumar712 Date: Tue, 18 May 2021 05:05:59 -0700 Subject: [PATCH 10/23] Update CI build to 1.6 --- .github/workflows/ci.yml | 11 +++++------ .travis.yml | 7 +++---- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 67b437d..8e04faf 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -13,8 +13,7 @@ jobs: strategy: matrix: version: - - '1.3' - - '1' + - '1.6' - 'nightly' os: - ubuntu-latest @@ -29,12 +28,12 @@ jobs: # MacOS not available on x86 - {os: 'macOS-latest', arch: 'x86'} # Don't test on all versions - - {os: 'macOS-latest', version: 
'1.3'} + - {os: 'macOS-latest', version: '1.6'} - {os: 'macOS-latest', version: 'nightly'} - - {os: 'windows-latest', version: '1.3'} + - {os: 'windows-latest', version: '1.6'} - {os: 'windows-latest', version: 'nightly'} - {os: 'windows-latest', arch: 'x86'} - - {arch: 'x86', version: '1.3'} + - {arch: 'x86', version: '1.6'} - {arch: 'x86', version: 'nightly'} steps: - uses: actions/checkout@v1 @@ -55,7 +54,7 @@ jobs: - uses: actions/checkout@v1 - uses: julia-actions/setup-julia@latest with: - version: '1.5' + version: '1.6' - run: julia --project=docs -e ' using Pkg; Pkg.develop(PackageSpec(; path=pwd())); diff --git a/.travis.yml b/.travis.yml index bf028c9..8e8320a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,17 +6,16 @@ os: env: - DATADEPS_ALWAYS_ACCEPT=true julia: - - 1.3 - - 1 + - 1.6 - nightly matrix: allow_failures: - julia: nightly exclude: - os: osx - julia: 1.3 + julia: 1.6 - os: windows - julia: 1.3 + julia: 1.6 - os: osx julia: nightly - os: windows From 8398f61d158585426fe962f1915ef0576a6eaec2 Mon Sep 17 00:00:00 2001 From: Adarshkumar712 Date: Fri, 21 May 2021 04:03:27 -0700 Subject: [PATCH 11/23] Update CorpusLoaders version --- Project.toml | 2 +- src/TextModels.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Project.toml b/Project.toml index df1e03f..e8a08dd 100644 --- a/Project.toml +++ b/Project.toml @@ -15,6 +15,7 @@ Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" Languages = "8ef0a80b-9436-5d2c-a485-80b904378c43" NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" +Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" @@ -24,7 +25,6 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [compat] BSON = "0.3.3" -DataDeps = "0.7.7" DataStructures = "0.18.9" Flux = "0.12.2" JSON = "0.21.1" diff --git a/src/TextModels.jl b/src/TextModels.jl index 9f8b5ca..5c88496 100644 --- a/src/TextModels.jl +++ b/src/TextModels.jl @@ -41,12 +41,12 @@ module TextModels # ULMFiT module ULMFiT using TextAnalysis - using DataDeps using Flux using Flux:crossentropy using Zygote using BSON using CorpusLoaders + using DataDeps include("ULMFiT/utils.jl") include("ULMFiT/datadeps.jl") include("ULMFiT/data_loaders.jl") From 6cd9824188b96d7c35c8c093eb916ddcfb49badd Mon Sep 17 00:00:00 2001 From: Adarshkumar712 Date: Fri, 21 May 2021 05:08:55 -0700 Subject: [PATCH 12/23] Reshape pretrained weights ULMFiT LM --- src/ULMFiT/pretrain_lm.jl | 5 +++++ test/ulmfit.jl | 6 ++++++ 2 files changed, 11 insertions(+) diff --git a/src/ULMFiT/pretrain_lm.jl b/src/ULMFiT/pretrain_lm.jl index 1a59112..e659f8e 100644 --- a/src/ULMFiT/pretrain_lm.jl +++ b/src/ULMFiT/pretrain_lm.jl @@ -163,6 +163,11 @@ end # To load model function load_model!(lm::LanguageModel, filepath::String) BSON.@load filepath weights + # reshape saved weights to match Recurr (h, c) shape + layers = [5, 6, 10, 11, 15, 16] + for l in layers + weights[l] = reshape(weights[l], length(weights[l]), 1) + end Flux.loadparams!(lm, weights) end diff --git a/test/ulmfit.jl b/test/ulmfit.jl index b3820f9..3deca62 100644 --- a/test/ulmfit.jl +++ b/test/ulmfit.jl @@ -95,6 +95,12 @@ end @test length(ULMFiT.get_trainable_params(lm.layers)) == 10 pretrained_weights = BSON.load(datadep"Pretrained ULMFiT Language Model/ulmfit_lm_en.bson") + # reshape weights of (h, c) + layers = [5, 6, 10, 11, 15, 16] + for i in layers + pretrained_weights[:weights][i] = 
reshape(pretrained_weights[:weights][i], length(pretrained_weights[:weights][i]), 1) + end + @test length(pretrained_weights[:weights]) == 16 @test all(size.(params(lm)) .== size.(pretrained_weights[:weights])) end From 47015785632dc881a079794b1f1456d585b7e401 Mon Sep 17 00:00:00 2001 From: Adarshkumar712 Date: Fri, 21 May 2021 09:25:03 -0700 Subject: [PATCH 13/23] Update crf test --- src/CRF/loss.jl | 2 +- test/crf.jl | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/CRF/loss.jl b/src/CRF/loss.jl index c1405fd..32501bd 100644 --- a/src/CRF/loss.jl +++ b/src/CRF/loss.jl @@ -5,7 +5,7 @@ Compute the Normalization / partition function or the Forward Algorithm score - `Z` """ function forward_score(c::CRF, x, init_α) - forward_var = log_sum_exp((c.W .+ (x[1]') .+ init_α)) + forward_var = log_sum_exp(c.W .+ x[1]' .+ init_α) for i in 2:length(x) forward_var = log_sum_exp((c.W .+ x[i]') .+ forward_var') diff --git a/test/crf.jl b/test/crf.jl index 3f9246a..a548a4b 100644 --- a/test/crf.jl +++ b/test/crf.jl @@ -1,6 +1,5 @@ using Flux -using Flux: LSTM, Dense, reset!, onehot, RNN -using Zygote: gradient +using Flux: gradient, LSTM, Dense, reset!, onehot, RNN, params using TextModels: score_sequence, forward_score @testset "crf" begin @@ -109,7 +108,7 @@ using TextModels: score_sequence, forward_score init_α = fill(-10000, (c.n + 2, 1)) init_α[c.n + 1] = 0 - loss(xs, ys) = crf_loss(c, m(xs), ys, init_α) + loss(xs, ys) = crf_loss(c, m(xs), ys, init_α) + 1e-4*sum(c.W.*c.W) opt = Descent(0.01) data = zip(X, Y) @@ -149,3 +148,4 @@ using TextModels: score_sequence, forward_score @test crf_param_1 != crf_param_2 end end + From 2251a40697abbe8d686798fe2bae02232b3d2fe5 Mon Sep 17 00:00:00 2001 From: Adarshkumar712 Date: Mon, 24 May 2021 05:18:15 -0700 Subject: [PATCH 14/23] Move docs from TextAnalysis --- docs/make.jl | 4 +- docs/src/sentiment.md | 41 ++++++++ docs/src/tagging.md | 237 ++++++++++++++++++++++++++++++++++++++++++ test/runtests.jl | 1 + 4 files changed, 282 insertions(+), 1 deletion(-) create mode 100644 docs/src/sentiment.md create mode 100644 docs/src/tagging.md diff --git a/docs/make.jl b/docs/make.jl index 5876f79..a2f72fd 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -8,8 +8,10 @@ makedocs( pages = [ "Home" => "index.md", "Conditional Random Fields" => "crf.md", - "Named Entity Recognition" => "ner.md", "ULMFiT" => "ULMFiT.md", + "Named Entity Recognition" => "ner.md", + "Tagging Schemes" => "tagging.md.md", + "Sentiment Analyzer" => "sentiment.md", "API References" => "APIReference.md" ], ) diff --git a/docs/src/sentiment.md b/docs/src/sentiment.md new file mode 100644 index 0000000..e2cfe57 --- /dev/null +++ b/docs/src/sentiment.md @@ -0,0 +1,41 @@ +## Sentiment Analyzer + +It can be used to find the sentiment score (between 0 and 1) of a word, sentence or a Document. +A trained model (using Flux) on IMDB word corpus with weights saved are used to calculate the sentiments. + + model = SentimentAnalyzer() + model(doc) + model(doc, handle_unknown) + +* doc = Input Document for calculating document (AbstractDocument type) +* handle_unknown = A function for handling unknown words. 
Should return an array (default (x)->[]) + +```julia +julia> using TextAnalysis + +julia> m = SentimentAnalyzer() +Sentiment Analysis Model Trained on IMDB with a 88587 word corpus + +julia> d1 = StringDocument("a very nice thing that everyone likes") +A StringDocument{String} + * Language: Languages.English() + * Title: Untitled Document + * Author: Unknown Author + * Timestamp: Unknown Time + * Snippet: a very nice thing that everyone likes + +julia> m(d1) +0.5183109f0 + +julia> d = StringDocument("a horrible thing that everyone hates") +A StringDocument{String} + * Language: Languages.English() + * Title: Untitled Document + * Author: Unknown Author + * Timestamp: Unknown Time + * Snippet: a horrible thing that everyone hates + +julia> m(d2) +0.47193584f0 + +``` diff --git a/docs/src/tagging.md b/docs/src/tagging.md new file mode 100644 index 0000000..90d85cf --- /dev/null +++ b/docs/src/tagging.md @@ -0,0 +1,237 @@ +## Tagging_schemes + +There are many tagging schemes used for sequence labelling. +TextAnalysis currently offers functions for conversion between these tagging format. + +* BIO1 +* BIO2 +* BIOES + +```julia +julia> tags = ["I-LOC", "O", "I-PER", "B-MISC", "I-MISC", "B-PER", "I-PER", "I-PER"] + +julia> tag_scheme!(tags, "BIO1", "BIOES") + +julia> tags +8-element Array{String,1}: + "S-LOC" + "O" + "S-PER" + "B-MISC" + "E-MISC" + "B-PER" + "I-PER" + "E-PER" +``` + +## Parts of Speech Tagging + +This package provides with two different Part of Speech Tagger. + +## Average Perceptron Part of Speech Tagger + +This tagger can be used to find the POS tag of a word or token in a given sentence. It is a based on `Average Perceptron Algorithm`. +The model can be trained from scratch and weights are saved in specified location. +The pretrained model can also be loaded and can be used directly to predict tags. 
+ +### To train model: +```julia +julia> tagger = PerceptronTagger(false) #we can use tagger = PerceptronTagger() +julia> fit!(tagger, [[("today","NN"),("is","VBZ"),("good","JJ"),("day","NN")]]) +iteration : 1 +iteration : 2 +iteration : 3 +iteration : 4 +iteration : 5 +``` + +### To load pretrained model: +```julia +julia> tagger = PerceptronTagger(true) +loaded successfully +PerceptronTagger(AveragePerceptron(Set(Any["JJS", "NNP_VBZ", "NN_NNS", "CC", "NNP_NNS", "EX", "NNP_TO", "VBD_DT", "LS", ("Council", "NNP") … "NNPS", "NNP_LS", "VB", "NNS_NN", "NNP_SYM", "VBZ", "VBZ_JJ", "UH", "SYM", "NNP_NN", "CD"]), Dict{Any,Any}("i+2 word wetlands"=>Dict{Any,Any}("NNS"=>0.0,"JJ"=>0.0,"NN"=>0.0),"i-1 tag+i word NNP basic"=>Dict{Any,Any}("JJ"=>0.0,"IN"=>0.0),"i-1 tag+i word DT chloride"=>Dict{Any,Any}("JJ"=>0.0,"NN"=>0.0),"i-1 tag+i word NN choo"=>Dict{Any,Any}("NNP"=>0.0,"NN"=>0.0),"i+1 word antarctica"=>Dict{Any,Any}("FW"=>0.0,"NN"=>0.0),"i-1 tag+i word -START- appendix"=>Dict{Any,Any}("NNP"=>0.0,"NNPS"=>0.0,"NN"=>0.0),"i-1 word wahoo"=>Dict{Any,Any}("JJ"=>0.0,"VBD"=>0.0),"i-1 tag+i word DT children's"=>Dict{Any,Any}("NNS"=>0.0,"NN"=>0.0),"i word dnipropetrovsk"=>Dict{Any,Any}("NNP"=>0.003,"NN"=>-0.003),"i suffix hla"=>Dict{Any,Any}("JJ"=>0.0,"NN"=>0.0)…), DefaultDict{Any,Any,Int64}(), DefaultDict{Any,Any,Int64}(), 1, ["-START-", "-START2-"]), Dict{Any,Any}("is"=>"VBZ","at"=>"IN","a"=>"DT","and"=>"CC","for"=>"IN","by"=>"IN","Retrieved"=>"VBN","was"=>"VBD","He"=>"PRP","in"=>"IN"…), Set(Any["JJS", "NNP_VBZ", "NN_NNS", "CC", "NNP_NNS", "EX", "NNP_TO", "VBD_DT", "LS", ("Council", "NNP") … "NNPS", "NNP_LS", "VB", "NNS_NN", "NNP_SYM", "VBZ", "VBZ_JJ", "UH", "SYM", "NNP_NN", "CD"]), ["-START-", "-START2-"], ["-END-", "-END2-"], Any[]) +``` + +### To predict tags: + +The perceptron tagger can predict tags over various document types- + + predict(tagger, sentence::String) + predict(tagger, Tokens::Array{String, 1}) + predict(tagger, sd::StringDocument) + predict(tagger, fd::FileDocument) + predict(tagger, td::TokenDocument) + +This can also be done by - + tagger(input) + + +```julia +julia> predict(tagger, ["today", "is"]) +2-element Array{Any,1}: + ("today", "NN") + ("is", "VBZ") + +julia> tagger(["today", "is"]) +2-element Array{Any,1}: + ("today", "NN") + ("is", "VBZ") +``` + +`PerceptronTagger(load::Bool)` + +* load = Boolean argument if `true` then pretrained model is loaded + +`fit!(self::PerceptronTagger, sentences::Vector{Vector{Tuple{String, String}}}, save_loc::String, nr_iter::Integer)` + +* self = `PerceptronTagger` object +* sentences = `Vector` of `Vector` of `Tuple` of pair of word or token and its POS tag [see above example] +* save_loc = location of file to save the trained weights +* nr_iter = Number of iterations to pass the `sentences` to train the model ( default 5) + +`predict(self::PerceptronTagger, tokens)` + +* self = PerceptronTagger +* tokens = `Vector` of words or tokens for which to predict tags + +## Neural Model for Part of Speech tagging using LSTMs, CNN and CRF + +The API provided is a pretrained model for tagging Part of Speech. +The current model tags all the POS Tagging is done based on [convention used in Penn Treebank](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html), with 36 different Part of Speech tags excludes punctuation. + +To use the API, we first load the model weights into an instance of tagger. 
+The function also accepts the paths of the model weights and model dicts (for character and word embeddings):
+
+    PoSTagger()
+    PoSTagger(dicts_path, weights_path)
+
+```julia
+julia> pos = PoSTagger()
+
+```
+
+!!! note
+    When you call `PoSTagger()` for the first time, the package will request permission to download the `Model_dicts` and `Model_weights`. Upon downloading, these are stored locally and managed by `DataDeps`. So, on subsequent uses the weights will not need to be downloaded again.
+
+Once we create an instance, we can call it to tag a String (sentence), a sequence of tokens, an `AbstractDocument` or a `Corpus`.
+
+    (pos::PoSTagger)(sentence::String)
+    (pos::PoSTagger)(tokens::Array{String, 1})
+    (pos::PoSTagger)(sd::StringDocument)
+    (pos::PoSTagger)(fd::FileDocument)
+    (pos::PoSTagger)(td::TokenDocument)
+    (pos::PoSTagger)(crps::Corpus)
+
+```julia
+
+julia> sentence = "This package is maintained by John Doe."
+"This package is maintained by John Doe."
+
+julia> tags = pos(sentence)
+8-element Array{String,1}:
+ "DT"
+ "NN"
+ "VBZ"
+ "VBN"
+ "IN"
+ "NNP"
+ "NNP"
+ "."
+
+```
+
+The API tokenizes the input sentences via the default tokenizer provided by `WordTokenizers`, which is currently set to the multilingual `TokTok Tokenizer`.
+
+```julia
+
+julia> using WordTokenizers
+
+julia> collect(zip(WordTokenizers.tokenize(sentence), tags))
+8-element Array{Tuple{String,String},1}:
+ ("This", "DT")
+ ("package", "NN")
+ ("is", "VBZ")
+ ("maintained", "VBN")
+ ("by", "IN")
+ ("John", "NNP")
+ ("Doe", "NNP")
+ (".", ".")
+
+```
+
+For tagging a multi-sentence text or document, one can use `split_sentences` from the `WordTokenizers.jl` package and run the POS model on each sentence.
+
+```julia
+julia> sentences = "Rabinov is winding up his term as ambassador. He will be replaced by Eliahu Ben-Elissar, a former Israeli envoy to Egypt and right-wing Likud party politician." # Sentence taken from CoNLL 2003 Dataset
+
+julia> splitted_sents = WordTokenizers.split_sentences(sentences)
+
+julia> tag_sequences = pos.(splitted_sents)
+2-element Array{Array{String,1},1}:
+ ["NNP", "VBZ", "VBG", "RP", "PRP\$", "NN", "IN", "NN", "."]
+ ["PRP", "MD", "VB", "VBN", "IN", "NNP", "NNP", ",", "DT", "JJ", "JJ", "NN", "TO", "NNP", "CC", "JJ", "NNP", "NNP", "NNP", "."]
+
+julia> zipped = [collect(zip(tag_sequences[i], WordTokenizers.tokenize(splitted_sents[i]))) for i in eachindex(splitted_sents)]
+
+julia> zipped[1]
+9-element Array{Tuple{String,String},1}:
+ ("NNP", "Rabinov")
+ ("VBZ", "is")
+ ("VBG", "winding")
+ ("RP", "up")
+ ("PRP\$", "his")
+ ("NN", "term")
+ ("IN", "as")
+ ("NN", "ambassador")
+ (".", ".")
+
+julia> zipped[2]
+20-element Array{Tuple{String,String},1}:
+ ("PRP", "He")
+ ("MD", "will")
+ ("VB", "be")
+ ("VBN", "replaced")
+ ("IN", "by")
+ ("NNP", "Eliahu")
+ ("NNP", "Ben-Elissar")
+ (",", ",")
+ ("DT", "a")
+ ("JJ", "former")
+ ("JJ", "Israeli")
+ ("NN", "envoy")
+ ("TO", "to")
+ ("NNP", "Egypt")
+ ("CC", "and")
+ ("JJ", "right-wing")
+ ("NNP", "Likud")
+ ("NNP", "party")
+ ("NNP", "politician")
+ (".", ".")
+
+```
+
+Since Part of Speech tagging is done at the sentence level,
+the text of an `AbstractDocument` is split into sentences and then labelled sentence by sentence.
+However, this is not possible for an `NGramDocument`, as its text cannot be recreated.
+For a `TokenDocument`, the text is approximated in order to split it into sentences, hence the following throws a warning when tagging the `Corpus`.
+ +```julia + +julia> crps = Corpus([StringDocument("We aRE vErY ClOSE tO ThE HEaDQuarTeRS."), TokenDocument("this is Bangalore.")]) +A Corpus with 2 documents: + * 1 StringDocument's + * 0 FileDocument's + * 1 TokenDocument's + * 0 NGramDocument's + +Corpus's lexicon contains 0 tokens +Corpus's index contains 0 tokens + +julia> pos(crps) +┌ Warning: TokenDocument's can only approximate the original text +└ @ TextAnalysis ~/.julia/dev/TextAnalysis/src/document.jl:220 +2-element Array{Array{Array{String,1},1},1}: + [["PRP", "VBP", "RB", "JJ", "TO", "DT", "NN", "."]] + [["DT", "VBZ", "NNP", "."]] + +``` diff --git a/test/runtests.jl b/test/runtests.jl index f28f076..2738bfa 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -7,5 +7,6 @@ println("Running tests:") include("crf.jl") include("ner.jl") include("pos.jl") +include("sentiment.jl") include("averagePerceptronTagger.jl") include("ulmfit.jl") From 4d9976d13983f217911fdfdf7686c113fd497f68 Mon Sep 17 00:00:00 2001 From: Adarshkumar712 Date: Mon, 24 May 2021 05:22:55 -0700 Subject: [PATCH 15/23] Correction in make.jl --- docs/make.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/make.jl b/docs/make.jl index a2f72fd..64de323 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -10,7 +10,7 @@ makedocs( "Conditional Random Fields" => "crf.md", "ULMFiT" => "ULMFiT.md", "Named Entity Recognition" => "ner.md", - "Tagging Schemes" => "tagging.md.md", + "Tagging Schemes" => "tagging.md", "Sentiment Analyzer" => "sentiment.md", "API References" => "APIReference.md" ], From 4f3481b3110c9d2270432856780dca32114b27af Mon Sep 17 00:00:00 2001 From: Adarshkumar712 Date: Mon, 24 May 2021 06:19:55 -0700 Subject: [PATCH 16/23] Update docs of crf and ULMFiT --- docs/src/ULMFiT.md | 216 +++++++++++++++++++++++---------------------- docs/src/crf.md | 31 +++---- 2 files changed, 125 insertions(+), 122 deletions(-) diff --git a/docs/src/ULMFiT.md b/docs/src/ULMFiT.md index 332e2fd..89622d1 100644 --- a/docs/src/ULMFiT.md +++ b/docs/src/ULMFiT.md @@ -18,37 +18,38 @@ Default data loaders are provided in the `data_loaders.jl`: In this step, Language Model will learn the general properties of the Language. To train the model we need a general domain corpus like WikiText-103. For training, a `generator` function is provided to create a `Channel` which will give mini-batch in every call. 
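Conceptually, such a generator walks over the token stream and, call by call, emits a `batchsize × bptt` slice of input tokens followed by the corresponding next-token targets. The sketch below illustrates the idea only — it is not the package's `generator` implementation, and the helper name `toy_generator` is made up:

```julia
# Illustrative sketch of a bptt-style batch generator (not the package's actual `generator`).
function toy_generator(ch::Channel, corpus::Vector{<:AbstractString}; batchsize = 4, bptt = 10)
    len  = div(length(corpus), batchsize)                       # tokens per batch column
    cols = [corpus[(i-1)*len+1 : i*len] for i in 1:batchsize]   # split the stream into batchsize columns
    n_steps = div(len - 1, bptt)
    put!(ch, n_steps)                                           # the first take! returns the number of X/Y pairs
    for s in 1:n_steps
        lo = (s - 1) * bptt + 1
        X = [[cols[b][lo + t]     for b in 1:batchsize] for t in 0:bptt-1]  # tokens at each time-step
        Y = [[cols[b][lo + t + 1] for b in 1:batchsize] for t in 0:bptt-1]  # next-token targets
        put!(ch, X)                                             # one call gives X ...
        put!(ch, Y)                                             # ... and the next gives Y
    end
end

corpus  = split("the quick brown fox jumps over the lazy dog " ^ 20)
loader  = Channel(ch -> toy_generator(ch, corpus; batchsize = 4, bptt = 5))
n_pairs = take!(loader)
X, Y    = take!(loader), take!(loader)
```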
After pre-processing the corpus, the tokenized corpus is given as input to the generator function and the Channel can be created like so: ```julia -julia> loader = Channel(x -> generator(x, corpus; batchsize=4, bptt=10)) -Channel{Any}(sz_max:0,sz_curr:1) +julia> loader = ULMFiT.imdb_fine_tune_data(4, 10) # batchsize=4, bptt=10 +Channel{Any}(0) (1 item available) julia> max_batches = take!(loader) # this is the first call to the loader # These are the subsequent calls in pairs for X and Y -julia> X = take!(Loaders) - 10-element Array{Array{Any,1},1}: - ["senjō", ",", "indicated", "after"] - ["no", "he", ",", "two"] - ["valkyria", "sent", "\"", "games"] - ["3", "a", "i", ","] - [":", "formal", "am", "making"] - ["", "demand", "to", "a"] - ["chronicles", "for", "some", "start"] - ["(", "surrender", "extent", "against"] - ["japanese", "of", "influenced", "the"] - [":", "the", "by", "vancouver"] - -julia> Y = take!(gen) -10-element Array{Array{Any,1},1}: -["no", "he", ",", "two"] -["valkyria", "sent", "\"", "games"] -["3", "a", "i", ","] -[":", "formal", "am", "making"] -["", "demand", "to", "a"] -["chronicles", "for", "some", "start"] -["(", "surrender", "extent", "against"] -["japanese", "of", "influenced", "the"] -[":", "the", "by", "vancouver"] -["戦場のヴァルキュリア", "arsenal", "them", "canucks"] +julia> X = take!(loader) +10-element Vector{Vector{Any}}: + ["i", "transparent", "it", "were"] + ["admit", "villain", "immediately", "all"] + [",", "who", "as", "first"] + ["the", "talks", "she", "rate"] + ["great", "like", "is", "."] + ["majority", "mortimer", "on", "even"] + ["of", "snerd", "for", "veda"] + ["films", "and", "a", "ann"] + ["released", "has", "few", "borg"] + ["before", "an", "seconds", "in"] + +julia> Y = take!(loader) +10-element Vector{Vector{Any}}: + ["admit", "villain", "immediately", "all"] + [",", "who", "as", "first"] + ["the", "talks", "she", "rate"] + ["great", "like", "is", "."] + ["majority", "mortimer", "on", "even"] + ["of", "snerd", "for", "veda"] + ["films", "and", "a", "ann"] + ["released", "has", "few", "borg"] + ["before", "an", "seconds", "in"] + ["say", "office", ",", "a"] + ``` Note that at the first call to this `Channel` the output will be maximum number of batches which it can give. Two calls to this `Channel` completed one batch, that is, it doesnot give `X` and `Y` both together in one call, two calls are needed, one first `X` is given out and in second `Y`. Also, to understand what are `batchsize` and `bptt`, refer this [blog](https://nextjournal.com/ComputerMaestro/jsoc19-practical-implementation-of-ulmfit-in-julia-2). @@ -199,24 +200,24 @@ This is basically a modification to the original LSTM layer. The layer uses [Dro ```julia # maskWi and maskWh are drop masks for Wi and Wh weights -julia> fieldnames(WeightDroppedLSTMCell) +julia> fieldnames(ULMFiT.WeightDroppedLSTMCell) (:Wi, :Wh, :b, :h, :c, :p, :maskWi, :maskWh, :active) # To deine a layer with 4 input size and 5 output size and 0.3 dropping probability -julia> wd = WeightDroppedLSTM(4, 5, 0.3); +julia> wd = ULMFiT.WeightDroppedLSTM(4, 5, 0.3); # Pass julia> x = rand(4); julia> h = wd(x) -Tracked 5-element Array{Float64,1}: - 0.06149460838123775 - -0.06028818475111407 - 0.07400426274491535 - -0.20671647527394219 - -0.00678279380721769 +5×1 Matrix{Float64}: + 0.17602923394922002 + 0.08615001440875035 + 0.015924513976372016 + 0.10526862977034518 + -0.04417581280319146 # To reset_masks! 
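# (reset_masks! draws fresh DropConnect masks for maskWi and maskWh, so the next
#  forward pass uses a new dropout pattern)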
-julia> reset_masks!(wd) +julia> ULMFiT.reset_masks!(wd) ``` ### Averaged-SGD LSTM (AWD_LSTM) @@ -226,63 +227,63 @@ This is a regular LSTM layer with Variational DropConnect and weights averaging ```julia # `accum` field is used to store the sum of weights for every iteration after trigger # to get average of the weights for every subsequent iteration -julia> fieldnames(AWD_LSTM) +julia> fieldnames(ULMFiT.AWD_LSTM) (:layer, :T, :accum) -julia> awd = AWD_LSTM(3, 4, 0.5) +julia> awd = ULMFiT.AWD_LSTM(3, 4, 0.5) # Setting trigger iteration -julia> set_trigger!(1000, awd) +julia> ULMFiT.set_trigger!(1000, awd) julia> awd.T 1000 # Pass -julia> x = rand(3) +julia> x = rand(3); julia> h = awd(x) -Tracked 4-element Array{Float64,1}: - -0.0751824486756288 - -0.3061227967356536 - -0.030079860137667995 - -0.09833401074779546 +4×1 Matrix{Float64}: + 0.15229648590284084 + -0.05929450272853615 + -0.06110043118692251 + 0.15302430271141032 # Resetting drop masks - julia> awd.layer.cell.maskWi - 16×3 Array{Float32,2}: - 0.0 2.0 2.0 - 2.0 2.0 2.0 +julia> awd.layer.cell.maskWi +16×3 Matrix{Float32}: + 0.0 0.0 0.0 + 2.0 0.0 0.0 0.0 2.0 0.0 - 0.0 0.0 2.0 - 0.0 0.0 2.0 - 2.0 2.0 2.0 + 0.0 0.0 0.0 2.0 2.0 2.0 - 0.0 2.0 2.0 0.0 2.0 0.0 2.0 0.0 2.0 + 2.0 2.0 2.0 + 2.0 0.0 0.0 0.0 0.0 2.0 - 0.0 2.0 2.0 + 2.0 0.0 0.0 2.0 0.0 2.0 0.0 2.0 0.0 0.0 2.0 0.0 - 2.0 0.0 2.0 + 2.0 2.0 2.0 + 2.0 2.0 2.0 - julia> reset_masks!(awd) - julia> awd.layer.cell.maskWi - 16×3 Array{Float32,2}: +julia> ULMFiT.reset_masks!(awd) +julia> awd.layer.cell.maskWi +16×3 Matrix{Float32}: 0.0 2.0 0.0 - 0.0 0.0 0.0 - 2.0 0.0 0.0 0.0 2.0 0.0 + 0.0 0.0 0.0 2.0 2.0 0.0 2.0 2.0 2.0 - 2.0 2.0 0.0 - 2.0 2.0 0.0 2.0 2.0 2.0 + 0.0 2.0 0.0 + 2.0 2.0 0.0 + 2.0 0.0 2.0 0.0 0.0 2.0 2.0 0.0 0.0 2.0 2.0 2.0 - 2.0 2.0 2.0 0.0 0.0 2.0 - 0.0 2.0 0.0 + 0.0 2.0 2.0 + 2.0 0.0 2.0 0.0 0.0 2.0 ``` @@ -291,33 +292,34 @@ Tracked 4-element Array{Float64,1}: This layer applis Variational-DropOut, which is, using same dropout mask till it is not specified to change or till a pass is over. This dropout is useful for recurrent layers since these layers perform better if same mask is used for all time-steps (pass) instead of using different for every timestep. [Refer [this](https://arxiv.org/pdf/1506.02557.pdf) paper for more details]. This layer saves the masks after generation till it is not specified to change. To change the mask use `reset_masks!` function. 
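For intuition, the "variational" part simply means the dropout mask is sampled once and then reused at every time-step of the pass, scaled by `1/(1 - p)` so that expectations are preserved. A rough stand-alone sketch of that idea (illustrative only, not the package's code):

```julia
# Variational dropout in a nutshell (illustrative only).
p    = 0.5
h    = 4                                              # hidden size
mask = (rand(Float32, h) .> p) ./ Float32(1 - p)      # sampled once: zeros and 1/(1-p) entries
xs   = [rand(Float32, h) for t in 1:3]                # a toy 3-step sequence
ys   = [x .* mask for x in xs]                        # the same mask is applied at every time-step
```

The `VarDrop` layer wraps exactly this behaviour, keeping the mask until `reset_masks!` is called: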
```julia -julia> vd = VarDrop(0.5) -VarDrop{Float64}(0.5, Array{Float32}(0,0), true, true) +julia> vd = ULMFiT.VarDrop(0.5) +VarDrop{Float64}(0.5, Matrix{Float32}(undef, 0, 0), true, true) # No mask generation will nothing is passed julia> vd.mask -0×0 Array{Float32,2} +0×0 Matrix{Float32} julia> x = rand(4,5) -4×5 Array{Float64,2}: - 0.480531 0.556341 0.228134 0.439411 0.137296 - 0.541459 0.118603 0.448941 0.568478 0.0440091 - 0.491735 0.55232 0.857768 0.729287 0.842753 - 0.33523 0.0378036 0.491757 0.00710462 0.374096 - - julia> x = vd(x) - 4×5 Array{Float64,2}: - 0.961062 1.11268 0.0 0.0 0.274592 - 1.08292 0.0 0.897881 0.0 0.0880182 - 0.98347 0.0 0.0 1.45857 1.68551 - 0.67046 0.0756071 0.983514 0.0142092 0.0 - - julia> vd.mask - 4×5 Array{Float64,2}: - 2.0 2.0 0.0 0.0 2.0 - 2.0 0.0 2.0 0.0 2.0 - 2.0 0.0 0.0 2.0 2.0 - 2.0 2.0 2.0 2.0 0.0 +4×5 Matrix{Float64}: + 0.383492 0.914917 0.616324 0.940116 0.526015 + 0.286494 0.35078 0.320465 0.334261 0.295965 + 0.232206 0.26289 0.940569 0.23259 0.675406 + 0.152903 0.934304 0.125803 0.727792 0.239359 + +julia> x = vd(x) +4×5 Matrix{Float64}: + 0.0 0.0 0.0 1.88023 1.05203 + 0.0 0.0 0.64093 0.668522 0.591929 + 0.464413 0.0 1.88114 0.0 0.0 + 0.0 0.0 0.0 0.0 0.478717 + +julia> vd.mask +4×5 Matrix{Float64}: + 0.0 0.0 0.0 2.0 2.0 + 0.0 0.0 2.0 2.0 2.0 + 2.0 0.0 2.0 0.0 0.0 + 0.0 0.0 0.0 0.0 2.0 + ``` ### Dropped Embeddings (DroppedEmbeddings) @@ -325,35 +327,35 @@ julia> x = rand(4,5) This layer is an embedding layer which can work in two ways either to give embeddings Vectors for the given indices of words in vocabulary or can be used to get probability distribution for all the words of vocabulary with softmax layer, which is also called as weight-tying. Here, it can be used to tie weights of the embedding layer and the last softmax layer. In addition to this, it also dropped embeddings for words randomly for given probability of dropping, in other words, it puts whole embedding vector of randomly selects to vector of zeros. Here, the mask used for the dropping posses variational property, that is, it cannot be changed till it is not specified to change or generate a new drop mask. `reset_masks!` should be used to reset the mask. ```julia -julia> fieldnames(DroppedEmbeddings) +julia> fieldnames(ULMFiT.DroppedEmbeddings) (:emb, :p, :mask, :active) -julia> de = DroppedEmbeddings(5, 2, 0.3) +julia> de = ULMFiT.DroppedEmbeddings(5, 2, 0.3); # Pass -julia> x = [4,2,1] +julia> x = [4,2,1]; julia> embeddings = de(x) -Tracked 2×3 LinearAlgebra.Transpose{Float32,Array{Float32,2}}: - 0.86327 0.537614 -0.0 - 0.152131 -0.541008 -0.0 +2×3 transpose(::Matrix{Float32}) with eltype Float32: + 0.363157 -0.0246867 -0.332342 + -0.553211 -0.594884 0.184288 - julia> de.mask - 5-element Array{Float32,1}: - 0.0 +julia> de.mask +5-element Vector{Float32}: + 1.4285715 1.4285715 1.4285715 1.4285715 1.4285715 - # reset mask - julia> reset_masks!(de) - julia> de.mask - 5-element Array{Float32,1}: - 0.0 +# reset mask +julia> reset_masks!(de) +julia> de.mask +5-element Vector{Float32}: 1.4285715 1.4285715 - 0.0 1.4285715 + 0.0 + 0.0 ``` ### Concat-Pooled Dense layer @@ -362,13 +364,13 @@ This is a simple modification to the original `Dense` layer for recurrent networ ```julia # The first argument is the length of the output Vector of the preceding RNN layer to this layer. 
Also, by default if uses identity activation, it can be changed by giving desired activaiton as the third argument -julia> pd = PooledDense(4, 3) +julia> pd = ULMFiT.PooledDense(4, 3) # Pass -julia> X = [rand(4), rand(4), rand(4)] +julia> X = [rand(4), rand(4), rand(4)]; julia> pd(X) -Tracked 3×1 Array{Float64,2}: - -2.2106991143006036 - -0.9560163708455404 - -0.4770649645417375 +3×1 Matrix{Float64}: + -1.3679283360573462 + 1.1115990254044759 + -0.27398355913859046 ``` diff --git a/docs/src/crf.md b/docs/src/crf.md index 19f958d..af93cbf 100644 --- a/docs/src/crf.md +++ b/docs/src/crf.md @@ -6,18 +6,19 @@ Let us first load the dependencies- using Flux using Flux: onehot, train!, Params, gradient, LSTM, Dense, reset! - using TextAnalysis: CRF, viterbi_decode, crf_loss + using TextModels: CRF, viterbi_decode, crf_loss Conditional Random Field layer is essentially like a softmax that operates on the top most layer. Let us suppose the following input sequence to the CRF with `NUM_LABELS = 2` ```julia +julia> NUM_LABELS = 2 julia> SEQUENCE_LENGTH = 2 # CRFs can handle variable length inputs sequences -julia> input_seq = [rand(NUM_LABELS + 2) for i in 1:SEQUENCE_LENGTH] # NUM_LABELS + 2, where two extra features correspond to the :START and :END label. -2-element Array{Array{Float64,1},1}: - [0.523462, 0.455434, 0.274347, 0.755279] - [0.610991, 0.315381, 0.0863632, 0.693031] +julia> input_seq = [Float32.(rand(NUM_LABELS + 2)) for i in 1:SEQUENCE_LENGTH] # NUM_LABELS + 2, where two extra features correspond to the :START and :END label. +2-element Vector{Vector{Float32}}: + [0.5114323, 0.5355139, 0.4011792, 0.56359255] + [0.22925346, 0.21232551, 0.77616125, 0.41560093] ``` @@ -56,16 +57,16 @@ julia> label_seq3 = [onehot(2, 1:2), onehot(1, 1:2)] julia> label_seq4 = [onehot(2, 1:2), onehot(2, 1:2)] julia> crf_loss(c, input_seq, label_seq1, init_α) -1.9206894963901504 (tracked) +1.33554f0 julia> crf_loss(c, input_seq, label_seq2, init_α) -1.4972745472075206 (tracked) +1.2327178f0 julia> crf_loss(c, input_seq, label_seq3, init_α) -1.543210471592448 (tracked) +1.3454239f0 julia> crf_loss(c, input_seq, label_seq4, init_α) -0.876923329893466 (tracked) +1.6871009f0 ``` @@ -75,9 +76,9 @@ We can decode this using Viterbi Decode. 
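Viterbi decoding is a standard dynamic program over the label lattice: at each position it keeps, for every label, the best score of any path ending in that label, combining the per-position input scores with the transition scores, and then follows backpointers to recover the best sequence. A generic sketch of the idea (illustrative only — this is not the package's `viterbi_decode`, and it ignores the special `:START`/`:END` handling):

```julia
# Generic Viterbi sketch over per-position label scores (illustrative only).
# `emit[t][l]` is the input score of label l at position t; `trans[i, j]` scores the transition i -> j.
function toy_viterbi(emit::Vector{Vector{Float64}}, trans::Matrix{Float64})
    T, L = length(emit), length(emit[1])
    δ    = copy(emit[1])                     # best score of a path ending in each label at position 1
    back = zeros(Int, T, L)                  # backpointers
    for t in 2:T
        δ_new = similar(δ)
        for l in 1:L
            best, arg  = findmax([δ[i] + trans[i, l] for i in 1:L])
            δ_new[l]   = best + emit[t][l]
            back[t, l] = arg
        end
        δ = δ_new
    end
    path = zeros(Int, T)
    path[T] = argmax(δ)
    for t in T-1:-1:1
        path[t] = back[t+1, path[t+1]]
    end
    return path
end

emit  = [[0.5, 0.1], [0.2, 0.9]]             # toy scores: 2 positions, 2 labels
trans = [0.3 0.0; 0.1 0.4]
toy_viterbi(emit, trans)                      # -> [1, 2]
```

`viterbi_decode` plays the same role for the `CRF` layer, additionally accounting for the `:START` and `:END` transitions: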
```julia julia> viterbi_decode(c, input_seq, init_α) # Gives the label_sequence with least loss -2-element Array{Flux.OneHotVector,1}: - [false, true] - [false, true] +2-element Vector{Flux.OneHotArray{UInt32, 2, 0, 1, UInt32}}: + [1, 0] + [0, 1] ``` @@ -96,7 +97,7 @@ CRFs smoothly work over Flux layers- julia> NUM_FEATURES = 20 julia> input_seq = [rand(NUM_FEATURES) for i in 1:SEQUENCE_LENGTH] -2-element Array{Array{Float64,1},1}: +2-element Vector{Vector{Float32}}: [0.948219, 0.719964, 0.352734, 0.0677656, 0.570564, 0.187673, 0.525125, 0.787807, 0.262452, 0.472472, 0.573259, 0.643369, 0.00592054, 0.945258, 0.951466, 0.323156, 0.679573, 0.663285, 0.218595, 0.152846] [0.433295, 0.11998, 0.99615, 0.530107, 0.188887, 0.897213, 0.993726, 0.0799431, 0.953333, 0.941808, 0.982638, 0.0919345, 0.27504, 0.894169, 0.66818, 0.449537, 0.93063, 0.384957, 0.415114, 0.212203] @@ -105,7 +106,7 @@ julia> m1 = Dense(NUM_FEATURES, NUM_LABELS + 2) julia> loss1(input_seq, label_seq) = crf_loss(c, m1.(input_seq), label_seq, init_α) # loss for model m1 julia> loss1(input_seq, [onehot(1, 1:2), onehot(1, 1:2)]) -4.6620379898687485 (tracked) +4.6620379898687485 ``` @@ -124,7 +125,7 @@ julia> m2(x) = dense_out.(lstm.(x)) julia> loss2(input_seq, label_seq) = crf_loss(c, m2(input_seq), label_seq, init_α) # loss for model m2 julia> loss2(input_seq, [onehot(1, 1:2), onehot(1, 1:2)]) -1.6501050910529504 (tracked) +1.6501050910529504 julia> reset!(lstm) ``` From 24fa0d8cc2a032ea8e3feca8c346b6c2eb53b993 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 24 May 2021 14:51:26 +0000 Subject: [PATCH 17/23] CompatHelper: bump compat for "BSON" to "0.3" --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 4687488..df48a3a 100644 --- a/Project.toml +++ b/Project.toml @@ -24,7 +24,7 @@ Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c" WordTokenizers = "796a5d58-b03d-544a-977e-18100b691f6e" [compat] -BSON = "0.2.5" +BSON = "0.2.5, 0.3" DataDeps = "0.7" DataStructures = "0.17, 0.18" Flux = "0.9" From 078e3888e0235b3ce16839f24aaa921399094698 Mon Sep 17 00:00:00 2001 From: Adarshkumar712 Date: Thu, 1 Jul 2021 00:55:43 -0700 Subject: [PATCH 18/23] Minor Corrections --- src/ULMFiT/custom_layers.jl | 12 +++++++++--- src/ULMFiT/fine_tune_lm.jl | 2 +- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/ULMFiT/custom_layers.jl b/src/ULMFiT/custom_layers.jl index c0275a6..d83c43c 100644 --- a/src/ULMFiT/custom_layers.jl +++ b/src/ULMFiT/custom_layers.jl @@ -63,7 +63,7 @@ function WeightDroppedLSTMCell(in::Integer, out::Integer, p::Float64=0.0; init(out*4, in), init(out*4, out), init(out*4), - reshape(zeros(Float32, out),out, 1), + reshape(zeros(Float32, out), out, 1), reshape(zeros(Float32, out), out, 1), p, drop_mask((out*4, in), p), @@ -112,9 +112,15 @@ function WeightDroppedLSTM(a...; kw...) return Flux.Recur(cell, hidden) end -# over definition for reset! to work with pretrained model +""" + reset!(m) + +Resets the h, c parameters of the LSTM Cell. + +For more refer [`Flux.reset`](@ref https://fluxml.ai/Flux.jl/stable/models/layers/#Flux.reset!) 
+""" function reset!(m) - try + try # to accomodate the definition in previously trained Language Model (m.state = (m.cell.h, m.cell.c)) catch Flux.reset!(m) diff --git a/src/ULMFiT/fine_tune_lm.jl b/src/ULMFiT/fine_tune_lm.jl index b2e7261..22a08d3 100644 --- a/src/ULMFiT/fine_tune_lm.jl +++ b/src/ULMFiT/fine_tune_lm.jl @@ -47,7 +47,7 @@ end epochs::Integer=1, checkpoint_itvl::Integer=5000) This function contains main training loops for fine-tuning the language model. -To use this funciton, an instance of LanguageModel and a data loader is needed. +To use this function, an instance of LanguageModel and a data loader is needed. Read the docs for more info about arguments """ function fine_tune_lm!(lm=LanguageModel(), data_loader=imdb_fine_tune_data, From 7b2b460899d2fa22213c2ae8296833a7e8b7ca27 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sun, 11 Jul 2021 15:39:27 +0000 Subject: [PATCH 19/23] CompatHelper: add new compat entry for "CUDA" at version "3" --- Project.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/Project.toml b/Project.toml index e8a08dd..bbfe4e7 100644 --- a/Project.toml +++ b/Project.toml @@ -25,6 +25,7 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [compat] BSON = "0.3.3" +CUDA = "3" DataStructures = "0.18.9" Flux = "0.12.2" JSON = "0.21.1" From 53ee374f1df4818e0f3a8ecc813275bba90ac025 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sun, 11 Jul 2021 15:39:31 +0000 Subject: [PATCH 20/23] CompatHelper: add new compat entry for "DataDeps" at version "0.7" --- Project.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/Project.toml b/Project.toml index e8a08dd..0580cde 100644 --- a/Project.toml +++ b/Project.toml @@ -25,6 +25,7 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [compat] BSON = "0.3.3" +DataDeps = "0.7" DataStructures = "0.18.9" Flux = "0.12.2" JSON = "0.21.1" From 3b8b81c9359a9a8990332d35cf275595566ecf9a Mon Sep 17 00:00:00 2001 From: Konstantinos Samaras-Tsakiris Date: Sun, 18 Jul 2021 13:22:54 +0200 Subject: [PATCH 21/23] Fix broken docs link --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index afa0f57..4b63f6e 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ A Julia package for working with text. ## Introduction -The TextModels package enhances the TextAnalysis package with end-user focussed, practical natural language models, typically based on neural networks (in this case, [Flux](https://fluxml.ai/)). Please see the [documentation](https://juliatext.github.io/TextAnalysis.jl/latest) for more. +The TextModels package enhances the TextAnalysis package with end-user focussed, practical natural language models, typically based on neural networks (in this case, [Flux](https://fluxml.ai/)). Please see the [documentation](https://juliahub.com/docs/TextModels) for more. 
- **License** : [MIT License](https://github.com/JuliaText/TextAnalysis.jl/blob/master/LICENSE.md) From fd2fd328241cf57b0a8a4c77caf30c285b30614d Mon Sep 17 00:00:00 2001 From: Avik Sengupta Date: Mon, 6 Dec 2021 00:06:21 +0000 Subject: [PATCH 22/23] CorpusLoader compat --- Project.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/Project.toml b/Project.toml index 3e434c8..f83804a 100644 --- a/Project.toml +++ b/Project.toml @@ -37,6 +37,7 @@ TextAnalysis = "0.7.3" WordTokenizers = "0.5.6" Zygote = "0.6.10" julia = "1.6" +CorpusLoaders = "0.3" [extras] Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" From 66ec1fe47c23966fe0b3e1d0bfa207963963cdb8 Mon Sep 17 00:00:00 2001 From: Avik Sengupta Date: Fri, 10 Dec 2021 18:29:44 -0500 Subject: [PATCH 23/23] Flux update --- Project.toml | 6 +++--- src/sequence/sequence_models.jl | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/Project.toml b/Project.toml index f83804a..bbe2f84 100644 --- a/Project.toml +++ b/Project.toml @@ -25,10 +25,11 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [compat] BSON = "0.3.3" -DataDeps = "0.7" CUDA = "3" +CorpusLoaders = "0.3" +DataDeps = "0.7" DataStructures = "0.18.9" -Flux = "0.12.2" +Flux = "0.12.8" JSON = "0.21.1" Languages = "0.4.3" NNlib = "0.7" @@ -37,7 +38,6 @@ TextAnalysis = "0.7.3" WordTokenizers = "0.5.6" Zygote = "0.6.10" julia = "1.6" -CorpusLoaders = "0.3" [extras] Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/src/sequence/sequence_models.jl b/src/sequence/sequence_models.jl index 8c8a6df..8b4a3a6 100644 --- a/src/sequence/sequence_models.jl +++ b/src/sequence/sequence_models.jl @@ -79,6 +79,8 @@ function BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, UNK_char_idx,UNK_Wor (1, 1), # stride (0, 2), # pad (1, 1), # dilation + 1 # groups + ) BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, conv1, W_Char_Embed, W_word_Embed,