diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..8e04faf --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,64 @@ +name: CI +on: + push: + branches: + - master + tags: '*' + pull_request: +jobs: + test: + name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} + runs-on: ${{ matrix.os }} + continue-on-error: ${{ matrix.version == 'nightly' }} + strategy: + matrix: + version: + - '1.6' + - 'nightly' + os: + - ubuntu-latest + - macOS-latest + - windows-latest + arch: + - x86 + - x64 + exclude: + # Remove some configurations from the build matrix to reduce CI time. + # See https://github.com/marketplace/actions/setup-julia-environment + # MacOS not available on x86 + - {os: 'macOS-latest', arch: 'x86'} + # Don't test on all versions + - {os: 'macOS-latest', version: '1.6'} + - {os: 'macOS-latest', version: 'nightly'} + - {os: 'windows-latest', version: '1.6'} + - {os: 'windows-latest', version: 'nightly'} + - {os: 'windows-latest', arch: 'x86'} + - {arch: 'x86', version: '1.6'} + - {arch: 'x86', version: 'nightly'} + steps: + - uses: actions/checkout@v1 + - uses: julia-actions/setup-julia@latest + with: + version: ${{ matrix.version }} + arch: ${{ matrix.arch }} + - uses: julia-actions/julia-buildpkg@latest + - uses: julia-actions/julia-runtest@latest + env: + DATADEPS_ALWAYS_ACCEPT: true + with: + coverage: false + docs: + name: Documentation + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v1 + - uses: julia-actions/setup-julia@latest + with: + version: '1.6' + - run: julia --project=docs -e ' + using Pkg; + Pkg.develop(PackageSpec(; path=pwd())); + Pkg.instantiate();' + - run: julia --project=docs docs/make.jl + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.travis.yml b/.travis.yml index bf028c9..8e8320a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,17 +6,16 @@ os: env: - DATADEPS_ALWAYS_ACCEPT=true julia: - - 1.3 - - 1 + - 1.6 - nightly matrix: allow_failures: - julia: nightly exclude: - os: osx - julia: 1.3 + julia: 1.6 - os: windows - julia: 1.3 + julia: 1.6 - os: osx julia: nightly - os: windows diff --git a/Project.toml b/Project.toml index 4687488..bbe2f84 100644 --- a/Project.toml +++ b/Project.toml @@ -2,40 +2,42 @@ name = "TextModels" uuid = "77b9cbda-2a23-51df-82a3-24144d1cd378" license = "MIT" desc = "Practical Neural Network based models for Natural Language Processing" -version = "0.1.0" +version = "0.1.1" [deps] BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +CorpusLoaders = "214a0ac2-f95b-54f7-a80b-442ed9c2c9e8" DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe" DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab" Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" Languages = "8ef0a80b-9436-5d2c-a485-80b904378c43" -LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" -Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" -Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" TextAnalysis = "a2db99b7-8b79-58f8-94bf-bbc811eef33d" -Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c" WordTokenizers = "796a5d58-b03d-544a-977e-18100b691f6e" +Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [compat] -BSON = 
"0.2.5" +BSON = "0.3.3" +CUDA = "3" +CorpusLoaders = "0.3" DataDeps = "0.7" -DataStructures = "0.17, 0.18" -Flux = "0.9" -JSON = "0.21" -Languages = "0.4" -NNlib = "0.6, 0.7" -StatsBase = "0.33" -TextAnalysis = "0.7" -Tracker = "0.2" -WordTokenizers = "0.5" -julia = "1.3" +DataStructures = "0.18.9" +Flux = "0.12.8" +JSON = "0.21.1" +Languages = "0.4.3" +NNlib = "0.7" +StatsBase = "0.33.6" +TextAnalysis = "0.7.3" +WordTokenizers = "0.5.6" +Zygote = "0.6.10" +julia = "1.6" [extras] Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/README.md b/README.md index afa0f57..4b63f6e 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ A Julia package for working with text. ## Introduction -The TextModels package enhances the TextAnalysis package with end-user focussed, practical natural language models, typically based on neural networks (in this case, [Flux](https://fluxml.ai/)). Please see the [documentation](https://juliatext.github.io/TextAnalysis.jl/latest) for more. +The TextModels package enhances the TextAnalysis package with end-user focussed, practical natural language models, typically based on neural networks (in this case, [Flux](https://fluxml.ai/)). Please see the [documentation](https://juliahub.com/docs/TextModels) for more. - **License** : [MIT License](https://github.com/JuliaText/TextAnalysis.jl/blob/master/LICENSE.md) diff --git a/docs/make.jl b/docs/make.jl index 5876f79..ca1f9ea 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -2,14 +2,16 @@ using Documenter, TextModels makedocs( modules = [TextModels], - sitename = "TextAnalysis", + sitename = "TextModels", format = Documenter.HTML( ), pages = [ "Home" => "index.md", "Conditional Random Fields" => "crf.md", - "Named Entity Recognition" => "ner.md", "ULMFiT" => "ULMFiT.md", + "Named Entity Recognition" => "ner.md", + "Tagging Schemes" => "tagging.md", + "Sentiment Analyzer" => "sentiment.md", "API References" => "APIReference.md" ], ) diff --git a/docs/src/ULMFiT.md b/docs/src/ULMFiT.md index 332e2fd..89622d1 100644 --- a/docs/src/ULMFiT.md +++ b/docs/src/ULMFiT.md @@ -18,37 +18,38 @@ Default data loaders are provided in the `data_loaders.jl`: In this step, Language Model will learn the general properties of the Language. To train the model we need a general domain corpus like WikiText-103. For training, a `generator` function is provided to create a `Channel` which will give mini-batch in every call. 
After pre-processing the corpus, the tokenized corpus is given as input to the generator function and the Channel can be created like so: ```julia -julia> loader = Channel(x -> generator(x, corpus; batchsize=4, bptt=10)) -Channel{Any}(sz_max:0,sz_curr:1) +julia> loader = ULMFiT.imdb_fine_tune_data(4, 10) # batchsize=4, bptt=10 +Channel{Any}(0) (1 item available) julia> max_batches = take!(loader) # this is the first call to the loader # These are the subsequent calls in pairs for X and Y -julia> X = take!(Loaders) - 10-element Array{Array{Any,1},1}: - ["senjō", ",", "indicated", "after"] - ["no", "he", ",", "two"] - ["valkyria", "sent", "\"", "games"] - ["3", "a", "i", ","] - [":", "formal", "am", "making"] - ["", "demand", "to", "a"] - ["chronicles", "for", "some", "start"] - ["(", "surrender", "extent", "against"] - ["japanese", "of", "influenced", "the"] - [":", "the", "by", "vancouver"] - -julia> Y = take!(gen) -10-element Array{Array{Any,1},1}: -["no", "he", ",", "two"] -["valkyria", "sent", "\"", "games"] -["3", "a", "i", ","] -[":", "formal", "am", "making"] -["", "demand", "to", "a"] -["chronicles", "for", "some", "start"] -["(", "surrender", "extent", "against"] -["japanese", "of", "influenced", "the"] -[":", "the", "by", "vancouver"] -["戦場のヴァルキュリア", "arsenal", "them", "canucks"] +julia> X = take!(loader) +10-element Vector{Vector{Any}}: + ["i", "transparent", "it", "were"] + ["admit", "villain", "immediately", "all"] + [",", "who", "as", "first"] + ["the", "talks", "she", "rate"] + ["great", "like", "is", "."] + ["majority", "mortimer", "on", "even"] + ["of", "snerd", "for", "veda"] + ["films", "and", "a", "ann"] + ["released", "has", "few", "borg"] + ["before", "an", "seconds", "in"] + +julia> Y = take!(loader) +10-element Vector{Vector{Any}}: + ["admit", "villain", "immediately", "all"] + [",", "who", "as", "first"] + ["the", "talks", "she", "rate"] + ["great", "like", "is", "."] + ["majority", "mortimer", "on", "even"] + ["of", "snerd", "for", "veda"] + ["films", "and", "a", "ann"] + ["released", "has", "few", "borg"] + ["before", "an", "seconds", "in"] + ["say", "office", ",", "a"] + ``` Note that at the first call to this `Channel` the output will be maximum number of batches which it can give. Two calls to this `Channel` completed one batch, that is, it doesnot give `X` and `Y` both together in one call, two calls are needed, one first `X` is given out and in second `Y`. Also, to understand what are `batchsize` and `bptt`, refer this [blog](https://nextjournal.com/ComputerMaestro/jsoc19-practical-implementation-of-ulmfit-in-julia-2). @@ -199,24 +200,24 @@ This is basically a modification to the original LSTM layer. The layer uses [Dro ```julia # maskWi and maskWh are drop masks for Wi and Wh weights -julia> fieldnames(WeightDroppedLSTMCell) +julia> fieldnames(ULMFiT.WeightDroppedLSTMCell) (:Wi, :Wh, :b, :h, :c, :p, :maskWi, :maskWh, :active) # To deine a layer with 4 input size and 5 output size and 0.3 dropping probability -julia> wd = WeightDroppedLSTM(4, 5, 0.3); +julia> wd = ULMFiT.WeightDroppedLSTM(4, 5, 0.3); # Pass julia> x = rand(4); julia> h = wd(x) -Tracked 5-element Array{Float64,1}: - 0.06149460838123775 - -0.06028818475111407 - 0.07400426274491535 - -0.20671647527394219 - -0.00678279380721769 +5×1 Matrix{Float64}: + 0.17602923394922002 + 0.08615001440875035 + 0.015924513976372016 + 0.10526862977034518 + -0.04417581280319146 # To reset_masks! 
-julia> reset_masks!(wd) +julia> ULMFiT.reset_masks!(wd) ``` ### Averaged-SGD LSTM (AWD_LSTM) @@ -226,63 +227,63 @@ This is a regular LSTM layer with Variational DropConnect and weights averaging ```julia # `accum` field is used to store the sum of weights for every iteration after trigger # to get average of the weights for every subsequent iteration -julia> fieldnames(AWD_LSTM) +julia> fieldnames(ULMFiT.AWD_LSTM) (:layer, :T, :accum) -julia> awd = AWD_LSTM(3, 4, 0.5) +julia> awd = ULMFiT.AWD_LSTM(3, 4, 0.5) # Setting trigger iteration -julia> set_trigger!(1000, awd) +julia> ULMFiT.set_trigger!(1000, awd) julia> awd.T 1000 # Pass -julia> x = rand(3) +julia> x = rand(3); julia> h = awd(x) -Tracked 4-element Array{Float64,1}: - -0.0751824486756288 - -0.3061227967356536 - -0.030079860137667995 - -0.09833401074779546 +4×1 Matrix{Float64}: + 0.15229648590284084 + -0.05929450272853615 + -0.06110043118692251 + 0.15302430271141032 # Resetting drop masks - julia> awd.layer.cell.maskWi - 16×3 Array{Float32,2}: - 0.0 2.0 2.0 - 2.0 2.0 2.0 +julia> awd.layer.cell.maskWi +16×3 Matrix{Float32}: + 0.0 0.0 0.0 + 2.0 0.0 0.0 0.0 2.0 0.0 - 0.0 0.0 2.0 - 0.0 0.0 2.0 - 2.0 2.0 2.0 + 0.0 0.0 0.0 2.0 2.0 2.0 - 0.0 2.0 2.0 0.0 2.0 0.0 2.0 0.0 2.0 + 2.0 2.0 2.0 + 2.0 0.0 0.0 0.0 0.0 2.0 - 0.0 2.0 2.0 + 2.0 0.0 0.0 2.0 0.0 2.0 0.0 2.0 0.0 0.0 2.0 0.0 - 2.0 0.0 2.0 + 2.0 2.0 2.0 + 2.0 2.0 2.0 - julia> reset_masks!(awd) - julia> awd.layer.cell.maskWi - 16×3 Array{Float32,2}: +julia> ULMFiT.reset_masks!(awd) +julia> awd.layer.cell.maskWi +16×3 Matrix{Float32}: 0.0 2.0 0.0 - 0.0 0.0 0.0 - 2.0 0.0 0.0 0.0 2.0 0.0 + 0.0 0.0 0.0 2.0 2.0 0.0 2.0 2.0 2.0 - 2.0 2.0 0.0 - 2.0 2.0 0.0 2.0 2.0 2.0 + 0.0 2.0 0.0 + 2.0 2.0 0.0 + 2.0 0.0 2.0 0.0 0.0 2.0 2.0 0.0 0.0 2.0 2.0 2.0 - 2.0 2.0 2.0 0.0 0.0 2.0 - 0.0 2.0 0.0 + 0.0 2.0 2.0 + 2.0 0.0 2.0 0.0 0.0 2.0 ``` @@ -291,33 +292,34 @@ Tracked 4-element Array{Float64,1}: This layer applis Variational-DropOut, which is, using same dropout mask till it is not specified to change or till a pass is over. This dropout is useful for recurrent layers since these layers perform better if same mask is used for all time-steps (pass) instead of using different for every timestep. [Refer [this](https://arxiv.org/pdf/1506.02557.pdf) paper for more details]. This layer saves the masks after generation till it is not specified to change. To change the mask use `reset_masks!` function. 
```julia -julia> vd = VarDrop(0.5) -VarDrop{Float64}(0.5, Array{Float32}(0,0), true, true) +julia> vd = ULMFiT.VarDrop(0.5) +VarDrop{Float64}(0.5, Matrix{Float32}(undef, 0, 0), true, true) # No mask generation will nothing is passed julia> vd.mask -0×0 Array{Float32,2} +0×0 Matrix{Float32} julia> x = rand(4,5) -4×5 Array{Float64,2}: - 0.480531 0.556341 0.228134 0.439411 0.137296 - 0.541459 0.118603 0.448941 0.568478 0.0440091 - 0.491735 0.55232 0.857768 0.729287 0.842753 - 0.33523 0.0378036 0.491757 0.00710462 0.374096 - - julia> x = vd(x) - 4×5 Array{Float64,2}: - 0.961062 1.11268 0.0 0.0 0.274592 - 1.08292 0.0 0.897881 0.0 0.0880182 - 0.98347 0.0 0.0 1.45857 1.68551 - 0.67046 0.0756071 0.983514 0.0142092 0.0 - - julia> vd.mask - 4×5 Array{Float64,2}: - 2.0 2.0 0.0 0.0 2.0 - 2.0 0.0 2.0 0.0 2.0 - 2.0 0.0 0.0 2.0 2.0 - 2.0 2.0 2.0 2.0 0.0 +4×5 Matrix{Float64}: + 0.383492 0.914917 0.616324 0.940116 0.526015 + 0.286494 0.35078 0.320465 0.334261 0.295965 + 0.232206 0.26289 0.940569 0.23259 0.675406 + 0.152903 0.934304 0.125803 0.727792 0.239359 + +julia> x = vd(x) +4×5 Matrix{Float64}: + 0.0 0.0 0.0 1.88023 1.05203 + 0.0 0.0 0.64093 0.668522 0.591929 + 0.464413 0.0 1.88114 0.0 0.0 + 0.0 0.0 0.0 0.0 0.478717 + +julia> vd.mask +4×5 Matrix{Float64}: + 0.0 0.0 0.0 2.0 2.0 + 0.0 0.0 2.0 2.0 2.0 + 2.0 0.0 2.0 0.0 0.0 + 0.0 0.0 0.0 0.0 2.0 + ``` ### Dropped Embeddings (DroppedEmbeddings) @@ -325,35 +327,35 @@ julia> x = rand(4,5) This layer is an embedding layer which can work in two ways either to give embeddings Vectors for the given indices of words in vocabulary or can be used to get probability distribution for all the words of vocabulary with softmax layer, which is also called as weight-tying. Here, it can be used to tie weights of the embedding layer and the last softmax layer. In addition to this, it also dropped embeddings for words randomly for given probability of dropping, in other words, it puts whole embedding vector of randomly selects to vector of zeros. Here, the mask used for the dropping posses variational property, that is, it cannot be changed till it is not specified to change or generate a new drop mask. `reset_masks!` should be used to reset the mask. ```julia -julia> fieldnames(DroppedEmbeddings) +julia> fieldnames(ULMFiT.DroppedEmbeddings) (:emb, :p, :mask, :active) -julia> de = DroppedEmbeddings(5, 2, 0.3) +julia> de = ULMFiT.DroppedEmbeddings(5, 2, 0.3); # Pass -julia> x = [4,2,1] +julia> x = [4,2,1]; julia> embeddings = de(x) -Tracked 2×3 LinearAlgebra.Transpose{Float32,Array{Float32,2}}: - 0.86327 0.537614 -0.0 - 0.152131 -0.541008 -0.0 +2×3 transpose(::Matrix{Float32}) with eltype Float32: + 0.363157 -0.0246867 -0.332342 + -0.553211 -0.594884 0.184288 - julia> de.mask - 5-element Array{Float32,1}: - 0.0 +julia> de.mask +5-element Vector{Float32}: + 1.4285715 1.4285715 1.4285715 1.4285715 1.4285715 - # reset mask - julia> reset_masks!(de) - julia> de.mask - 5-element Array{Float32,1}: - 0.0 +# reset mask +julia> reset_masks!(de) +julia> de.mask +5-element Vector{Float32}: 1.4285715 1.4285715 - 0.0 1.4285715 + 0.0 + 0.0 ``` ### Concat-Pooled Dense layer @@ -362,13 +364,13 @@ This is a simple modification to the original `Dense` layer for recurrent networ ```julia # The first argument is the length of the output Vector of the preceding RNN layer to this layer. 
Also, by default if uses identity activation, it can be changed by giving desired activaiton as the third argument -julia> pd = PooledDense(4, 3) +julia> pd = ULMFiT.PooledDense(4, 3) # Pass -julia> X = [rand(4), rand(4), rand(4)] +julia> X = [rand(4), rand(4), rand(4)]; julia> pd(X) -Tracked 3×1 Array{Float64,2}: - -2.2106991143006036 - -0.9560163708455404 - -0.4770649645417375 +3×1 Matrix{Float64}: + -1.3679283360573462 + 1.1115990254044759 + -0.27398355913859046 ``` diff --git a/docs/src/crf.md b/docs/src/crf.md index 19f958d..af93cbf 100644 --- a/docs/src/crf.md +++ b/docs/src/crf.md @@ -6,18 +6,19 @@ Let us first load the dependencies- using Flux using Flux: onehot, train!, Params, gradient, LSTM, Dense, reset! - using TextAnalysis: CRF, viterbi_decode, crf_loss + using TextModels: CRF, viterbi_decode, crf_loss Conditional Random Field layer is essentially like a softmax that operates on the top most layer. Let us suppose the following input sequence to the CRF with `NUM_LABELS = 2` ```julia +julia> NUM_LABELS = 2 julia> SEQUENCE_LENGTH = 2 # CRFs can handle variable length inputs sequences -julia> input_seq = [rand(NUM_LABELS + 2) for i in 1:SEQUENCE_LENGTH] # NUM_LABELS + 2, where two extra features correspond to the :START and :END label. -2-element Array{Array{Float64,1},1}: - [0.523462, 0.455434, 0.274347, 0.755279] - [0.610991, 0.315381, 0.0863632, 0.693031] +julia> input_seq = [Float32.(rand(NUM_LABELS + 2)) for i in 1:SEQUENCE_LENGTH] # NUM_LABELS + 2, where two extra features correspond to the :START and :END label. +2-element Vector{Vector{Float32}}: + [0.5114323, 0.5355139, 0.4011792, 0.56359255] + [0.22925346, 0.21232551, 0.77616125, 0.41560093] ``` @@ -56,16 +57,16 @@ julia> label_seq3 = [onehot(2, 1:2), onehot(1, 1:2)] julia> label_seq4 = [onehot(2, 1:2), onehot(2, 1:2)] julia> crf_loss(c, input_seq, label_seq1, init_α) -1.9206894963901504 (tracked) +1.33554f0 julia> crf_loss(c, input_seq, label_seq2, init_α) -1.4972745472075206 (tracked) +1.2327178f0 julia> crf_loss(c, input_seq, label_seq3, init_α) -1.543210471592448 (tracked) +1.3454239f0 julia> crf_loss(c, input_seq, label_seq4, init_α) -0.876923329893466 (tracked) +1.6871009f0 ``` @@ -75,9 +76,9 @@ We can decode this using Viterbi Decode. 
```julia
julia> viterbi_decode(c, input_seq, init_α) # Gives the label_sequence with least loss
-2-element Array{Flux.OneHotVector,1}:
- [false, true]
- [false, true]
+2-element Vector{Flux.OneHotArray{UInt32, 2, 0, 1, UInt32}}:
+ [1, 0]
+ [0, 1]
```
@@ -96,7 +97,7 @@ CRFs smoothly work over Flux layers-
julia> NUM_FEATURES = 20

julia> input_seq = [rand(NUM_FEATURES) for i in 1:SEQUENCE_LENGTH]
-2-element Array{Array{Float64,1},1}:
+2-element Vector{Vector{Float64}}:
 [0.948219, 0.719964, 0.352734, 0.0677656, 0.570564, 0.187673, 0.525125, 0.787807, 0.262452, 0.472472, 0.573259, 0.643369, 0.00592054, 0.945258, 0.951466, 0.323156, 0.679573, 0.663285, 0.218595, 0.152846]
 [0.433295, 0.11998, 0.99615, 0.530107, 0.188887, 0.897213, 0.993726, 0.0799431, 0.953333, 0.941808, 0.982638, 0.0919345, 0.27504, 0.894169, 0.66818, 0.449537, 0.93063, 0.384957, 0.415114, 0.212203]

julia> m1 = Dense(NUM_FEATURES, NUM_LABELS + 2)

julia> loss1(input_seq, label_seq) = crf_loss(c, m1.(input_seq), label_seq, init_α) # loss for model m1

julia> loss1(input_seq, [onehot(1, 1:2), onehot(1, 1:2)])
-4.6620379898687485 (tracked)
+4.6620379898687485
```
@@ -124,7 +125,7 @@ julia> m2(x) = dense_out.(lstm.(x))
julia> loss2(input_seq, label_seq) = crf_loss(c, m2(input_seq), label_seq, init_α) # loss for model m2

julia> loss2(input_seq, [onehot(1, 1:2), onehot(1, 1:2)])
-1.6501050910529504 (tracked)
+1.6501050910529504

julia> reset!(lstm)
```
diff --git a/docs/src/index.md b/docs/src/index.md
index 2168e45..8c36217 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -2,6 +2,8 @@

The TextModels package enhances the TextAnalysis package with end-user focussed, practical natural language models, typically based on neural networks (in this case, [Flux](https://fluxml.ai/))

+This package depends on the [TextAnalysis](https://github.com/JuliaText/TextAnalysis.jl) package, which contains basic algorithms to deal with textual documents.
+
## Installation

The TextModels package can be installed using Julia's package manager:
diff --git a/docs/src/sentiment.md b/docs/src/sentiment.md
new file mode 100644
index 0000000..e2cfe57
--- /dev/null
+++ b/docs/src/sentiment.md
@@ -0,0 +1,41 @@
+## Sentiment Analyzer
+
+The sentiment analyzer can be used to find the sentiment score (between 0 and 1) of a word, a sentence, or a document.
+A Flux model trained on the IMDB word corpus, with saved weights, is used to calculate the sentiment.
+
+    model = SentimentAnalyzer()
+    model(doc)
+    model(doc, handle_unknown)
+
+* doc = input document for which to calculate the sentiment (AbstractDocument type)
+* handle_unknown = A function for handling unknown words.
+Should return an array (default: `(x)->[]`).
+
+```julia
+julia> using TextAnalysis, TextModels
+
+julia> m = SentimentAnalyzer()
+Sentiment Analysis Model Trained on IMDB with a 88587 word corpus
+
+julia> d1 = StringDocument("a very nice thing that everyone likes")
+A StringDocument{String}
+ * Language: Languages.English()
+ * Title: Untitled Document
+ * Author: Unknown Author
+ * Timestamp: Unknown Time
+ * Snippet: a very nice thing that everyone likes
+
+julia> m(d1)
+0.5183109f0
+
+julia> d2 = StringDocument("a horrible thing that everyone hates")
+A StringDocument{String}
+ * Language: Languages.English()
+ * Title: Untitled Document
+ * Author: Unknown Author
+ * Timestamp: Unknown Time
+ * Snippet: a horrible thing that everyone hates
+
+julia> m(d2)
+0.47193584f0
+
+```
diff --git a/docs/src/tagging.md b/docs/src/tagging.md
new file mode 100644
index 0000000..90d85cf
--- /dev/null
+++ b/docs/src/tagging.md
@@ -0,0 +1,237 @@
+## Tagging Schemes
+
+There are many tagging schemes used for sequence labelling.
+TextAnalysis currently offers functions for converting between these tagging formats:
+
+* BIO1
+* BIO2
+* BIOES
+
+```julia
+julia> tags = ["I-LOC", "O", "I-PER", "B-MISC", "I-MISC", "B-PER", "I-PER", "I-PER"]
+
+julia> tag_scheme!(tags, "BIO1", "BIOES")
+
+julia> tags
+8-element Array{String,1}:
+ "S-LOC"
+ "O"
+ "S-PER"
+ "B-MISC"
+ "E-MISC"
+ "B-PER"
+ "I-PER"
+ "E-PER"
+```
+
+## Parts of Speech Tagging
+
+This package provides two different Part of Speech taggers.
+
+## Average Perceptron Part of Speech Tagger
+
+This tagger can be used to find the POS tag of a word or token in a given sentence. It is based on the `Average Perceptron Algorithm`.
+The model can be trained from scratch, and the weights are saved at a specified location.
+The pretrained model can also be loaded and used directly to predict tags.
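+
+As a quick orientation, here is a minimal usage sketch that simply combines the loading and prediction calls documented in the sections below (the exact tags returned depend on the pretrained weights):
+
+```julia
+julia> tagger = PerceptronTagger(true)          # load the pretrained model
+
+julia> tagger(["today", "is", "good", "day"])   # returns a Vector of (token, tag) pairs
+```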
+
+### To train model:
+```julia
+julia> tagger = PerceptronTagger(false) # we can use tagger = PerceptronTagger()
+julia> fit!(tagger, [[("today","NN"),("is","VBZ"),("good","JJ"),("day","NN")]])
+iteration : 1
+iteration : 2
+iteration : 3
+iteration : 4
+iteration : 5
+```
+
+### To load pretrained model:
+```julia
+julia> tagger = PerceptronTagger(true)
+loaded successfully
+PerceptronTagger(AveragePerceptron(Set(Any["JJS", "NNP_VBZ", "NN_NNS", "CC", "NNP_NNS", "EX", "NNP_TO", "VBD_DT", "LS", ("Council", "NNP") … "NNPS", "NNP_LS", "VB", "NNS_NN", "NNP_SYM", "VBZ", "VBZ_JJ", "UH", "SYM", "NNP_NN", "CD"]), Dict{Any,Any}("i+2 word wetlands"=>Dict{Any,Any}("NNS"=>0.0,"JJ"=>0.0,"NN"=>0.0),"i-1 tag+i word NNP basic"=>Dict{Any,Any}("JJ"=>0.0,"IN"=>0.0),"i-1 tag+i word DT chloride"=>Dict{Any,Any}("JJ"=>0.0,"NN"=>0.0),"i-1 tag+i word NN choo"=>Dict{Any,Any}("NNP"=>0.0,"NN"=>0.0),"i+1 word antarctica"=>Dict{Any,Any}("FW"=>0.0,"NN"=>0.0),"i-1 tag+i word -START- appendix"=>Dict{Any,Any}("NNP"=>0.0,"NNPS"=>0.0,"NN"=>0.0),"i-1 word wahoo"=>Dict{Any,Any}("JJ"=>0.0,"VBD"=>0.0),"i-1 tag+i word DT children's"=>Dict{Any,Any}("NNS"=>0.0,"NN"=>0.0),"i word dnipropetrovsk"=>Dict{Any,Any}("NNP"=>0.003,"NN"=>-0.003),"i suffix hla"=>Dict{Any,Any}("JJ"=>0.0,"NN"=>0.0)…), DefaultDict{Any,Any,Int64}(), DefaultDict{Any,Any,Int64}(), 1, ["-START-", "-START2-"]), Dict{Any,Any}("is"=>"VBZ","at"=>"IN","a"=>"DT","and"=>"CC","for"=>"IN","by"=>"IN","Retrieved"=>"VBN","was"=>"VBD","He"=>"PRP","in"=>"IN"…), Set(Any["JJS", "NNP_VBZ", "NN_NNS", "CC", "NNP_NNS", "EX", "NNP_TO", "VBD_DT", "LS", ("Council", "NNP") … "NNPS", "NNP_LS", "VB", "NNS_NN", "NNP_SYM", "VBZ", "VBZ_JJ", "UH", "SYM", "NNP_NN", "CD"]), ["-START-", "-START2-"], ["-END-", "-END2-"], Any[])
+```
+
+### To predict tags:
+
+The perceptron tagger can predict tags over various document types:
+
+    predict(tagger, sentence::String)
+    predict(tagger, Tokens::Array{String, 1})
+    predict(tagger, sd::StringDocument)
+    predict(tagger, fd::FileDocument)
+    predict(tagger, td::TokenDocument)
+
+This can also be done by calling the tagger directly:
+
+    tagger(input)
+
+```julia
+julia> predict(tagger, ["today", "is"])
+2-element Array{Any,1}:
+ ("today", "NN")
+ ("is", "VBZ")
+
+julia> tagger(["today", "is"])
+2-element Array{Any,1}:
+ ("today", "NN")
+ ("is", "VBZ")
+```
+
+`PerceptronTagger(load::Bool)`
+
+* load = Boolean argument; if `true`, the pretrained model is loaded
+
+`fit!(self::PerceptronTagger, sentences::Vector{Vector{Tuple{String, String}}}, save_loc::String, nr_iter::Integer)`
+
+* self = `PerceptronTagger` object
+* sentences = `Vector` of `Vector`s of `Tuple`s, each pairing a word or token with its POS tag [see the example above]
+* save_loc = location of the file in which to save the trained weights
+* nr_iter = number of iterations for which to pass the `sentences` to train the model (default 5)
+
+`predict(self::PerceptronTagger, tokens)`
+
+* self = PerceptronTagger
+* tokens = `Vector` of words or tokens for which to predict tags
+
+## Neural Model for Part of Speech tagging using LSTMs, CNN and CRF
+
+The API provides a pretrained model for Part of Speech tagging.
+Tagging follows the [convention used in the Penn Treebank](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html), with 36 different Part of Speech tags, excluding punctuation.
+
+To use the API, we first load the model weights into an instance of the tagger.
+The function also accepts the paths of the model_weights and model_dicts (for character and word embeddings).
+
+    PoSTagger()
+    PoSTagger(dicts_path, weights_path)
+
+```julia
+julia> pos = PoSTagger()
+
+```
+
+!!! note
+    When you call `PoSTagger()` for the first time, the package will request permission to download the `Model_dicts` and `Model_weights`. Once downloaded, these are stored locally and managed by `DataDeps`, so on subsequent uses the weights will not need to be downloaded again.
+
+Once we create an instance, we can call it to tag a String (sentence), a sequence of tokens, an `AbstractDocument`, or a `Corpus`.
+
+    (pos::PoSTagger)(sentence::String)
+    (pos::PoSTagger)(tokens::Array{String, 1})
+    (pos::PoSTagger)(sd::StringDocument)
+    (pos::PoSTagger)(fd::FileDocument)
+    (pos::PoSTagger)(td::TokenDocument)
+    (pos::PoSTagger)(crps::Corpus)
+
+```julia
+
+julia> sentence = "This package is maintained by John Doe."
+"This package is maintained by John Doe."
+
+julia> tags = pos(sentence)
+8-element Array{String,1}:
+ "DT"
+ "NN"
+ "VBZ"
+ "VBN"
+ "IN"
+ "NNP"
+ "NNP"
+ "."
+
+```
+
+The API tokenizes the input sentences via the default tokenizer provided by `WordTokenizers`, which is currently set to the multilingual `TokTok` tokenizer.
+
+```julia
+
+julia> using WordTokenizers
+
+julia> collect(zip(WordTokenizers.tokenize(sentence), tags))
+8-element Array{Tuple{String,String},1}:
+ ("This", "DT")
+ ("package", "NN")
+ ("is", "VBZ")
+ ("maintained", "VBN")
+ ("by", "IN")
+ ("John", "NNP")
+ ("Doe", "NNP")
+ (".", ".")
+
+```
+
+For tagging a multi-sentence text or document, one can use `split_sentences` from the `WordTokenizers.jl` package and run the POS model on each sentence.
+
+```julia
+julia> sentences = "Rabinov is winding up his term as ambassador. He will be replaced by Eliahu Ben-Elissar, a former Israeli envoy to Egypt and right-wing Likud party politiian." # Sentence taken from CoNLL 2003 Dataset
+
+julia> splitted_sents = WordTokenizers.split_sentences(sentences)
+
+julia> tag_sequences = pos.(splitted_sents)
+2-element Array{Array{String,1},1}:
+ ["NNP", "VBZ", "VBG", "RP", "PRP\$", "NN", "IN", "NN", "."]
+ ["PRP", "MD", "VB", "VBN", "IN", "NNP", "NNP", ",", "DT", "JJ", "JJ", "NN", "TO", "NNP", "CC", "JJ", "NNP", "NNP", "NNP", "."]
+
+julia> zipped = [collect(zip(tag_sequences[i], WordTokenizers.tokenize(splitted_sents[i]))) for i in eachindex(splitted_sents)]
+
+julia> zipped[1]
+9-element Array{Tuple{String,String},1}:
+ ("NNP", "Rabinov")
+ ("VBZ", "is")
+ ("VBG", "winding")
+ ("RP", "up")
+ ("PRP\$", "his")
+ ("NN", "term")
+ ("IN", "as")
+ ("NN", "ambassador")
+ (".", ".")
+
+julia> zipped[2]
+20-element Array{Tuple{String,String},1}:
+ ("PRP", "He")
+ ("MD", "will")
+ ("VB", "be")
+ ("VBN", "replaced")
+ ("IN", "by")
+ ("NNP", "Eliahu")
+ ("NNP", "Ben-Elissar")
+ (",", ",")
+ ("DT", "a")
+ ("JJ", "former")
+ ("JJ", "Israeli")
+ ("NN", "envoy")
+ ("TO", "to")
+ ("NNP", "Egypt")
+ ("CC", "and")
+ ("JJ", "right-wing")
+ ("NNP", "Likud")
+ ("NNP", "party")
+ ("NNP", "politiian")
+ (".", ".")
+
+```
+
+Since Part of Speech tagging is done at the sentence level,
+the text of an `AbstractDocument` is split into sentences and each sentence is then labelled.
+However, this is not possible for an `NGramDocument`, as its text cannot be recreated.
+For a `TokenDocument`, the text is approximated in order to split it into sentences; hence the following throws a warning when tagging the `Corpus`.
+ +```julia + +julia> crps = Corpus([StringDocument("We aRE vErY ClOSE tO ThE HEaDQuarTeRS."), TokenDocument("this is Bangalore.")]) +A Corpus with 2 documents: + * 1 StringDocument's + * 0 FileDocument's + * 1 TokenDocument's + * 0 NGramDocument's + +Corpus's lexicon contains 0 tokens +Corpus's index contains 0 tokens + +julia> pos(crps) +┌ Warning: TokenDocument's can only approximate the original text +└ @ TextAnalysis ~/.julia/dev/TextAnalysis/src/document.jl:220 +2-element Array{Array{Array{String,1},1},1}: + [["PRP", "VBP", "RB", "JJ", "TO", "DT", "NN", "."]] + [["DT", "VBZ", "NNP", "."]] + +``` diff --git a/src/CRF/crf.jl b/src/CRF/crf.jl index 98ffc23..3145d89 100644 --- a/src/CRF/crf.jl +++ b/src/CRF/crf.jl @@ -22,10 +22,10 @@ function CRF(n::Integer) W[:, n + 1] .= -10000 W[n + 2, :] .= -10000 - return CRF(param(W), n) + return CRF(W, n) end -@treelike CRF +@functor CRF function Base.show(io::IO, c::CRF) print(io, "CRF with ", c.n + 2, " distinct tags (including START and STOP tags).") diff --git a/src/CRF/loss.jl b/src/CRF/loss.jl index 495816d..32501bd 100644 --- a/src/CRF/loss.jl +++ b/src/CRF/loss.jl @@ -5,13 +5,13 @@ Compute the Normalization / partition function or the Forward Algorithm score - `Z` """ function forward_score(c::CRF, x, init_α) - forward_var = log_sum_exp((c.W .+ x[1]') .+ init_α) + forward_var = log_sum_exp(c.W .+ x[1]' .+ init_α) for i in 2:length(x) forward_var = log_sum_exp((c.W .+ x[i]') .+ forward_var') end - return log_sum_exp(c.W[:, c.n + 2] + forward_var')[1] + return log_sum_exp(c.W[:, c.n + 2] .+ forward_var')[1] end """ diff --git a/src/CRF/predict.jl b/src/CRF/predict.jl index 29e2c34..3225b70 100644 --- a/src/CRF/predict.jl +++ b/src/CRF/predict.jl @@ -35,14 +35,14 @@ Computes the forward pass for viterbi algorithm. 
function _decode(c::CRF, x, init_vit_vars) α_idx = zeros(Int, c.n + 2, length(x)) - forward_var, α_idx[:, 1] = forward_pass_unit(Tracker.data((c.W .+ x[1]') .+ init_vit_vars)) + forward_var, α_idx[:, 1] = forward_pass_unit((c.W .+ x[1]') .+ init_vit_vars) for i in 2:length(x) - forward_var, α_idx[:, i] = forward_pass_unit(Tracker.data((c.W .+ x[i]') .+ forward_var')) + forward_var, α_idx[:, i] = forward_pass_unit((c.W .+ x[i]') .+ forward_var') end labels = zeros(Int, length(x)) - labels[end] = argmax(forward_var + Tracker.data(c.W[:, c.n + 2])')[2] + labels[end] = argmax(forward_var + (c.W[:, c.n + 2])')[2] for i in reverse(2:length(x)) labels[i - 1] = α_idx[labels[i], i] diff --git a/src/TextModels.jl b/src/TextModels.jl index a82ec68..5c88496 100644 --- a/src/TextModels.jl +++ b/src/TextModels.jl @@ -7,8 +7,8 @@ module TextModels using Pkg.Artifacts - using Flux, Tracker - using Flux: identity, onehot, onecold, @treelike, onehotbatch + using Flux, Zygote + using Flux: identity, onehot, onecold, @functor, onehotbatch using TextAnalysis @@ -36,15 +36,17 @@ module TextModels include("sequence/pos_datadeps.jl") include("sequence/pos.jl") include("sequence/sequence_models.jl") - - + + # ULMFiT module ULMFiT - using ..TextAnalysis - using DataDeps + using TextAnalysis using Flux - using Tracker + using Flux:crossentropy + using Zygote using BSON + using CorpusLoaders + using DataDeps include("ULMFiT/utils.jl") include("ULMFiT/datadeps.jl") include("ULMFiT/data_loaders.jl") @@ -60,7 +62,7 @@ module TextModels ner_datadep_register() pos_datadep_register() ULMFiT.ulmfit_datadep_register() - + global sentiment_model = artifact"sentiment_model" end end diff --git a/src/ULMFiT/custom_layers.jl b/src/ULMFiT/custom_layers.jl index e402c7d..d83c43c 100644 --- a/src/ULMFiT/custom_layers.jl +++ b/src/ULMFiT/custom_layers.jl @@ -8,7 +8,7 @@ This file contains the custom layers defined for this model: PooledDense """ -import Flux: gate, _testmode!, _dropout_kernel +import Flux: gate, testmode!, _dropout_kernel reset_masks!(entity) = nothing reset_probability!(entity) = nothing @@ -44,12 +44,12 @@ Moreover this also follows the Vartional DropOut citeria, that is, the drop mask is remains same for a whole training pass. This is done by saving the masks in 'maskWi' and 'maskWh' fields """ -mutable struct WeightDroppedLSTMCell{A, V, M} +mutable struct WeightDroppedLSTMCell{A, V, S, M} Wi::A Wh::A b::V - h::V - c::V + h::S + c::S p::Float64 maskWi::M maskWh::M @@ -60,17 +60,17 @@ function WeightDroppedLSTMCell(in::Integer, out::Integer, p::Float64=0.0; init = Flux.glorot_uniform) @assert 0 ≤ p ≤ 1 cell = WeightDroppedLSTMCell( - param(init(out*4, in)), - param(init(out*4, out)), - param(init(out*4)), - param(zeros(Float32, out)), - param(zeros(Float32, out)), + init(out*4, in), + init(out*4, out), + init(out*4), + reshape(zeros(Float32, out), out, 1), + reshape(zeros(Float32, out), out, 1), p, drop_mask((out*4, in), p), drop_mask((out*4, out), p), true ) - cell.b.data[gate(out, 2)] .= 1 + cell.b[gate(out, 2)] .= 1 return cell end @@ -88,9 +88,12 @@ function (m::WeightDroppedLSTMCell)((h, c), x) return (h′, c), h′ end -Flux.@treelike WeightDroppedLSTMCell +Flux.@functor WeightDroppedLSTMCell -_testmode!(m::WeightDroppedLSTMCell, test) = (m.active = !test) +Flux.trainable(m::WeightDroppedLSTMCell) = (m.Wi, m.Wh, m.b, m.h, m.c) + +testmode!(m::WeightDroppedLSTMCell, mode=true) = + (m.active = (isnothing(mode) || mode == :auto) ? 
nothing : !mode; m) """ WeightDroppedLSTM(in::Integer, out::Integer, p::Float64=0.0) @@ -106,9 +109,25 @@ julia> wd = WeightDroppedLSTM(4, 5, 0.3); function WeightDroppedLSTM(a...; kw...) cell = WeightDroppedLSTMCell(a...;kw...) hidden = (cell.h, cell.c) - return Flux.Recur(cell, hidden, hidden) + return Flux.Recur(cell, hidden) end +""" + reset!(m) + +Resets the h, c parameters of the LSTM Cell. + +For more refer [`Flux.reset`](@ref https://fluxml.ai/Flux.jl/stable/models/layers/#Flux.reset!) +""" +function reset!(m) + try # to accomodate the definition in previously trained Language Model + (m.state = (m.cell.h, m.cell.c)) + catch + Flux.reset!(m) + end +end + + """ reset_masks!(layer) @@ -155,7 +174,9 @@ end AWD_LSTM(in::Integer, out::Integer, p::Float64=0.0; kw...) = AWD_LSTM(WeightDroppedLSTM(in, out, p; kw...), -1, []) -Flux.@treelike AWD_LSTM +Flux.@functor AWD_LSTM + +Flux.trainable(m::AWD_LSTM) = (m.layer,) (m::AWD_LSTM)(in) = m.layer(in) @@ -184,12 +205,12 @@ function asgd_step!(iter::Integer, layer::AWD_LSTM) p = get_trainable_params([layer]) avg_fact = 1/max(iter - layer.T + 1, 1) if avg_fact != 1 - layer.accum = layer.accum .+ Tracker.data.(p) + layer.accum = layer.accum .+ p for (ps, accum) in zip(p, layer.accum) - Tracker.data(ps) .= avg_fact*accum + ps .= avg_fact*accum end else - layer.accum = deepcopy(Tracker.data.(p)) # Accumulator for ASGD + layer.accum = deepcopy(p) # Accumulator for ASGD end end return @@ -230,7 +251,8 @@ function (vd::VarDrop)(x) return (x .* vd.mask) end -_testmode!(vd::VarDrop, test) = (vd.active = !test) +testmode!(m::VarDrop, mode=true) = + (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) # method for reseting mask of VarDrop reset_masks!(vd::VarDrop) = (vd.reset = true) @@ -270,7 +292,7 @@ end function DroppedEmbeddings(in::Integer, embed_size::Integer, p::Float64=0.0; init = Flux.glorot_uniform) de = DroppedEmbeddings{AbstractArray, typeof(p)}( - param(init(in, embed_size)), + init(in, embed_size), p, drop_mask((in,), p), true @@ -283,9 +305,12 @@ function (de::DroppedEmbeddings)(x::AbstractArray, tying::Bool=false) return tying ? dropped * x : transpose(dropped[x, :]) end -Flux.@treelike DroppedEmbeddings +Flux.@functor DroppedEmbeddings + +Flux.trainable(m::DroppedEmbeddings) = (m.emb,) -_testmode!(de::DroppedEmbeddings, test) = (de.active = !test) +testmode!(m::DroppedEmbeddings, mode=true) = + (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) function reset_masks!(de::DroppedEmbeddings) de.mask = drop_mask(de.mask, de.p) @@ -324,10 +349,10 @@ PooledDense(W, b) = PooledDense(W, b, identity) function PooledDense(hidden_sz::Integer, out::Integer, σ = identity; initW = Flux.glorot_uniform, initb = (dims...) 
-> zeros(Float32, dims...)) -return PooledDense(param(initW(out, hidden_sz*3)), param(initb(out)), σ) +return PooledDense(initW(out, hidden_sz*3), initb(out), σ) end -Flux.@treelike PooledDense +Flux.@functor PooledDense function (a::PooledDense)(x) W, b, σ = a.W, a.b, a.σ diff --git a/src/ULMFiT/data_loaders.jl b/src/ULMFiT/data_loaders.jl index f59e403..839b408 100644 --- a/src/ULMFiT/data_loaders.jl +++ b/src/ULMFiT/data_loaders.jl @@ -27,29 +27,29 @@ function imdb_preprocess(doc::AbstractDocument) length(word) == 1 && return [word] return split(word, symbol) end - text = text(doc) - remove_corrupt_utf8!(text) - remove_case!(text) - prepare!(text, strip_html_tags) - tokens = tokens(text) + text_ = doc + remove_corrupt_utf8!(text_) + remove_case!(text_) + prepare!(text_, strip_html_tags) + tokens_ = tokens(text_) for symbol in [',', '.', '-', '/', "'s"] - tokens = split_word.(tokens, symbol) + tokens_ = split_word.(tokens_, symbol) temp = [] - for token in tokens + for token_ in tokens_ try - append!(temp, put(token, symbol)) + append!(temp, put(token_, symbol)) catch - append!(temp, token) + append!(temp, token_) end end - tokens = temp + tokens_ = temp end - deleteat!(tokens, findall(x -> isequal(x, "")||isequal(x, " "), tokens)) - return tokens + deleteat!(tokens_, findall(x -> isequal(x, "")||isequal(x, " "), tokens_)) + return tokens_ end # Loads WikiText-103 corpus and output a Channel to give a mini-batch at each call -function load_wikitext_103(batchsize::Integer, bptt::Integer; type = "train") +function load_wikitext_103(batchsize::Integer=16, bptt::Integer=70; type = "train") corpuspath = joinpath(datadep"WikiText-103", "wiki.$(type).tokens") corpus = read(open(corpuspath, "r"), String) corpus = tokenize(corpus) @@ -58,13 +58,13 @@ end # IMDB Data loaders for Sentiment Analysis specifically # IMDB data loader for fine-tuning Language Model -function imdb_fine_tune_data(batchsize::Integer, bptt::Integer, num_examples::Integer=50000) +function imdb_fine_tune_data(batchsize::Integer=16, bptt::Integer=70, num_examples::Integer=50000) imdb_dataset = IMDB("train_unsup") dataset = [] - for path in imdb_dataset.filepaths #extract data from the files in directory and put into channel + for path in imdb_dataset.filepaths[1:num_examples] #extract data from the files in directory and put into channel open(path) do fileio cur_text = read(fileio, String) - append!(dataset, imdb_preprocess(cur_text)) + append!(dataset, imdb_preprocess(StringDocument(cur_text))) end #open end #for return Channel(x -> generator(x, dataset; batchsize=batchsize, bptt=bptt)) diff --git a/src/ULMFiT/fine_tune_lm.jl b/src/ULMFiT/fine_tune_lm.jl index 17f33b9..22a08d3 100644 --- a/src/ULMFiT/fine_tune_lm.jl +++ b/src/ULMFiT/fine_tune_lm.jl @@ -24,17 +24,17 @@ opts : `Vector` of optimizers used to update weights for corresponding la NOTE: length(opts) == length(layers) """ -function discriminative_step!(layers, ηL::Float64, l, opts::Vector) +function discriminative_step!(layers, lm::LanguageModel, gen, ηL::Float64, opts::Vector) @assert length(opts) == length(layers) # Gradient calculation - grads = Tracker.gradient(() -> l, get_trainable_params(layers)) + grads = Zygote.gradient(() -> loss(lm, gen), get_trainable_params(layers)) # discriminative step ηl = ηL/(2.6^(length(layers)-1)) for (layer, opt) in zip(layers, opts) opt.eta = ηl for ps in get_trainable_params([layer]) - Tracker.update!(opt, ps, grads[ps]) + Flux.Optimise.update!(opt, ps, grads[ps]) end ηl *= 2.6 end @@ -47,35 +47,31 @@ end epochs::Integer=1, 
checkpoint_itvl::Integer=5000) This function contains main training loops for fine-tuning the language model. -To use this funciton, an instance of LanguageModel and a data loader is needed. +To use this function, an instance of LanguageModel and a data loader is needed. Read the docs for more info about arguments """ -function fine_tune_lm!(lm::LanguageModel, data_loader::Channel=imdb_fine_tune_data, - stlr_cut_frac::Float64=0.1, stlr_ratio::Float32=32, stlr_η_max::Float64=4e-3; +function fine_tune_lm!(lm=LanguageModel(), data_loader=imdb_fine_tune_data, + stlr_cut_frac::Float64=0.1, stlr_ratio::Float32=Float32(32), stlr_η_max::Float64=4e-3; epochs::Integer=1, checkpoint_itvl::Integer=5000) opts = [ADAM(0.001, (0.7, 0.99)) for i=1:4] - cut = num_of_iters * epochs * stlr_cut_frac - + # Fine-Tuning loops for epoch=1:epochs println("\nEpoch: $epoch") - gen = data_loader() - num_of_iters = take!(gen) + gen = data_loader() + num_of_iters = take!(gen) + cut = num_of_iters * epochs * stlr_cut_frac T = num_of_iters-Int(floor((num_of_iters*2)/100)) set_trigger!.(T, lm.layers) for i=1:num_of_iters - - # FORWARD - l = loss(lm, gen) - # Slanted triangular learning rate step t = i + (epoch-1)*num_of_iters p_frac = (i < cut) ? i/cut : (1 - ((i-cut)/(cut*(1/stlr_cut_frac-1)))) ηL = stlr_η_max*((1+p_frac*(stlr_ratio-1))/stlr_ratio) # Backprop with discriminative fine-tuning step - discriminative_step!(lm.layers[[1, 3, 5, 7]], ηL, l, opts) + discriminative_step!(lm.layers[[1, 3, 5, 7]], lm, gen, ηL, opts) # Resets dropout masks for all the layers with DropOut or DropConnect reset_masks!.(lm.layers) @@ -121,7 +117,7 @@ julia> insert!(vocab, 2, "_pad_") function set_vocab!(lm::LanguageModel, vocab::Vector) idxs = indices(vocab, lm.vocab) lm.vocab = vocab - lm.layers[1].emb = param(Tracker.data(lm.layers[1].emb)[idxs, :]) + lm.layers[1].emb = param(lm.layers[1].emb[idxs, :]) lm.layers[1].mask = gpu(drop_mask((length(vocab),), lm.layers[1].p)) return end diff --git a/src/ULMFiT/pretrain_lm.jl b/src/ULMFiT/pretrain_lm.jl index 74bc573..e659f8e 100644 --- a/src/ULMFiT/pretrain_lm.jl +++ b/src/ULMFiT/pretrain_lm.jl @@ -49,7 +49,7 @@ function LanguageModel(load_pretrained::Bool=false, vocabpath::String=joinpath(@ return lm end -Flux.@treelike LanguageModel +Flux.@functor LanguageModel """ test_lm(lm::LanguageModel, data_gen, num_of_iters::Integer; unknown_token::String="_unk_") @@ -63,7 +63,7 @@ It returns loss, accuracy, precsion, recall and F1 score. 
julia> test_lm(lm, data_gen, 200, " indices(x, lm.vocab, "_unk_"), batch) + batch = gpu(batch) batch = lm.layers.(batch) return batch end @@ -101,17 +102,17 @@ end function loss(lm, gen) H = forward(lm, take!(gen)) Y = broadcast(x -> gpu(Flux.onehotbatch(x, lm.vocab, "_unk_")), take!(gen)) - l = sum(crossentropy.(H, Y)) - Flux.truncate!(lm.layers) + l = sum(Flux.crossentropy.(H, Y)) + reset!(lm.layers) return l end # Backpropagation step while training -function backward!(layers, l, opt) +function backward!(layers, lm, gen, opt) # Calulating gradients and weights updation p = get_trainable_params(layers) - grads = Tracker.gradient(() -> l, p) - Tracker.update!(opt, p, grads) + grads = Zygote.gradient(() -> loss(lm, gen), p) + Flux.Optimise.update!(opt, p, grads) return end @@ -138,11 +139,8 @@ function pretrain_lm!(lm::LanguageModel=LanguageModel(), data_loader::Channel=lo set_trigger!.(T, lm.layers) # Setting triggers for AWD_LSTM layers for i=1:num_of_batches - # FORWARD PASS - l = loss(lm, gen) - # REVERSE PASS - backward!(lm.layers, l, opt) + backward!(lm.layers, lm, gen, opt) # ASGD Step, works after Triggering asgd_step!.(i, lm.layers) @@ -158,13 +156,18 @@ end # To save model function save_model!(m::LanguageModel, filepath::String) - weights = cpu.(Tracker.data.(params(m))) + weights = cpu.(params(m)) BSON.@save filepath weights end # To load model function load_model!(lm::LanguageModel, filepath::String) BSON.@load filepath weights + # reshape saved weights to match Recurr (h, c) shape + layers = [5, 6, 10, 11, 15, 16] + for l in layers + weights[l] = reshape(weights[l], length(weights[l]), 1) + end Flux.loadparams!(lm, weights) end @@ -182,7 +185,7 @@ SAMPLING... """ function sample(starting_text::AbstractDocument, lm::LanguageModel) testmode!(lm.layers) - model_layers = mapleaves(Tracker.data, lm.layers) + model_layers = lm.layers tokens = tokens(starting_text) word_indices = map(x -> indices([x], lm.vocab, "_unk_"), tokens) h = (model_layers.(word_indices))[end] diff --git a/src/ULMFiT/sentiment.jl b/src/ULMFiT/sentiment.jl index c70069d..3ab5479 100644 --- a/src/ULMFiT/sentiment.jl +++ b/src/ULMFiT/sentiment.jl @@ -48,12 +48,12 @@ function BinSentimentClassifier() ) ) Flux.loadparams!(sc, weights) - sc = mapleaves(Tracker.data, sc) + sc = sc Flux.testmode!(sc) return sc end -Flux.@treelike BinSentimentClassifier +Flux.@functor BinSentimentClassifier function (sc::BinSentimentClassifier)(x::TokenDocument) remove_case!(x) diff --git a/src/ULMFiT/train_text_classifier.jl b/src/ULMFiT/train_text_classifier.jl index e30912f..702bd21 100644 --- a/src/ULMFiT/train_text_classifier.jl +++ b/src/ULMFiT/train_text_classifier.jl @@ -30,7 +30,7 @@ function TextClassifier(lm::LanguageModel=LanguageModel(), clsfr_out_sz::Integer ) end -Flux.@treelike TextClassifier +Flux.@functor TextClassifier """ Cross Validate @@ -48,7 +48,7 @@ gen will be used for validation """ function validate(tc::TextClassifier, gen::Channel, num_of_batches::Union{Colon, Integer}) n_classes = size(tc.linear_layers[end-2].W, 1) - classifier = mapleaves(Tracker.data, tc) + classifier = tc Flux.testmode!(classifier) loss = 0 iters = take!(gen) @@ -91,15 +91,17 @@ tracked_steps : This is the number of tracked time-steps for Truncated Backpro """ function forward(tc::TextClassifier, gen::Channel, tracked_steps::Integer=32) # swiching off tracking - classifier = mapleaves(Tracker.data, tc) + classifier = tc X = take!(gen) l = length(X) # Truncated Backprop through time - for i=1:ceil(l/now_per_pass)-1 # Tracking is swiched 
off inside this loop
-        (i == 1 && l%now_per_pass != 0) ? (last_idx = l%now_per_pass) : (last_idx = now_per_pass)
-        H = broadcast(x -> indices(x, classifier.vocab, "_unk_"), X[1:last_idx])
-        H = classifier.rnn_layers.(H)
-        X = X[last_idx+1:end]
+    Zygote.ignore() do
+        for i=1:ceil(l/tracked_steps)-1 # Tracking is switched off inside this loop
+            (i == 1 && l%tracked_steps != 0) ? (last_idx = l%tracked_steps) : (last_idx = tracked_steps)
+            H = broadcast(x -> indices(x, classifier.vocab, "_unk_"), X[1:last_idx])
+            H = classifier.rnn_layers.(H)
+            X = X[last_idx+1:end]
+        end
     end
     # set the lated hidden states to original model
     for (t_layer, unt_layer) in zip(tc.rnn_layers[2:end], classifier.rnn_layers[2:end])
@@ -130,7 +132,7 @@ Arguments:
 classifier : Instance of TextClassifier
 gen : 'Channel' [data loader], to give a mini-batch
-tracked_words : specifies the number of time-steps for which tracking is on
+tracked_steps : specifies the number of time-steps for which tracking is on
 """
 function loss(classifier::TextClassifier, gen::Channel, tracked_steps::Integer=32)
     H = forward(classifier, gen, tracked_steps)
@@ -140,6 +142,23 @@ function loss(classifier::TextClassifier, gen::Channel, tracked_steps::Integer=3
     return l
 end
 
+function discriminative_step!(layers, classifier::TextClassifier, gen::Channel, tracked_steps::Integer, ηL::Float64, opts::Vector)
+    @assert length(opts) == length(layers)
+    # Gradient calculation
+    grads = Zygote.gradient(() -> loss(classifier, gen, tracked_steps), get_trainable_params(layers))
+
+    # discriminative step
+    ηl = ηL/(2.6^(length(layers)-1))
+    for (layer, opt) in zip(layers, opts)
+        opt.eta = ηl
+        for ps in get_trainable_params([layer])
+            Flux.Optimise.update!(opt, ps, grads[ps])
+        end
+        ηl *= 2.6
+    end
+    return
+end
+
 """
     train_classifier!(classifier::TextClassifier=TextClassifier(), classes::Integer=1, data_loader::Channel=imdb_classifier_data, hidden_layer_size::Integer=50;kw...)
@@ -151,7 +170,7 @@ function train_classifier!(classifier::TextClassifier=TextClassifier(), classes:
     data_loader::Channel=imdb_classifier_data, hidden_layer_size::Integer=50;
     stlr_cut_frac::Float64=0.1, stlr_ratio::Number=32, stlr_η_max::Float64=0.01,
     val_loader::Channel=nothing, cross_val_batches::Union{Colon, Integer}=:,
-    epochs::Integer=1, checkpoint_itvl=5000)
+    epochs::Integer=1, checkpoint_itvl=5000, tracked_steps::Integer=32)
 
     trainable = []
     append!(trainable, [classifier.rnn_layers[[1, 3, 5, 7]]...])
@@ -166,7 +185,6 @@ function train_classifier!(classifier::TextClassifier=TextClassifier(), classes:
         num_of_iters = take!(gen)
         cut = num_of_iters * epochs * stlr_cut_frac
         for iter=1:num_of_iters
-            l = loss(classifier, gen, now_per_pass = now_per_pass)
 
             # Slanted triangular learning rates
             t = iter + (epoch-1)*num_of_iters
@@ -175,7 +193,7 @@ function train_classifier!(classifier::TextClassifier=TextClassifier(), classes:
 
             # Gradual-unfreezing Step with discriminative fine-tuning
            unfreezed_layers, cur_opts = (epoch < length(trainable)) ?
(trainable[end-epoch+1:end], opts[end-epoch+1:end]) : (trainable, opts) - discriminative_step!(unfreezed_layers, ηL, l, cur_opts) + discriminative_step!(unfreezed_layers, classifier, gen, tracked_steps,ηL, cur_opts) reset_masks!.(classifier.rnn_layers) # reset all dropout masks end @@ -203,13 +221,13 @@ All the preprocessing related to the used vocabulary should be done before using Use `prepare!` function to do preprocessing """ function predict(tc::TextClassifier, text_sents::Corpus) - classifier = mapleaves(Tracker.data, tc) + classifier = tc Flux.testmode!(classifier) predictions = [] expr(x) = indices(x, classifier.vocab, "_unk_") for text in text_sents - tokens = tokens(text) - h = classifier.rnn_layers.(expr.(tokens)) + tokens_ = tokens(text) + h = classifier.rnn_layers.(expr.(tokens_)) probability_dist = classifier.linear_layers(h) class = argmax(probaility_dist) push!(predictions, class) diff --git a/src/ULMFiT/utils.jl b/src/ULMFiT/utils.jl index 691354f..64bfd11 100644 --- a/src/ULMFiT/utils.jl +++ b/src/ULMFiT/utils.jl @@ -27,8 +27,8 @@ end init_weights(extreme::AbstractFloat, dims...) = randn(Float32, dims...) .* sqrt(Float32(extreme)) # Generator, whenever it should be called two times since it gives X in first and y in second call -function generator(c::Channel, corpus::AbstractDocument; batchsize::Integer=64, bptt::Integer=70) - X_total = post_pad_sequences(chunk(tokens(corpus), batchsize)) +function generator(c::Channel, corpus; batchsize::Integer=64, bptt::Integer=70) + X_total = post_pad_sequences(Flux.chunk(corpus, batchsize)) n_batches = Int(floor(length(X_total[1])/bptt)) put!(c, n_batches) for i=1:n_batches diff --git a/src/sequence/pos.jl b/src/sequence/pos.jl index 9346a3a..b23c210 100644 --- a/src/sequence/pos.jl +++ b/src/sequence/pos.jl @@ -1,4 +1,4 @@ -using BSON, Tracker +using BSON const PoSCharUNK = '¿' const PoSWordUNK = "" diff --git a/src/sequence/sequence_models.jl b/src/sequence/sequence_models.jl index b19e6a0..8b4a3a6 100644 --- a/src/sequence/sequence_models.jl +++ b/src/sequence/sequence_models.jl @@ -1,4 +1,4 @@ -using BSON, Tracker +using BSON mutable struct BiLSTM_CNN_CRF_Model{C, W, L, D, O, A} labels::Array{String, 1} # List of Labels chars_idx#::Dict{Char, Integer} # Dict that maps chars to indices in W_Char_Embed @@ -33,32 +33,32 @@ function BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, UNK_char_idx,UNK_Wor init_α[n + 1] = 0 # Word and Character Embeddings. 
- W_word_Embed = BSON.load(joinpath(weights_path, "W_word_cpu.bson"))[:W_word_cpu] - W_Char_Embed = BSON.load(joinpath(weights_path, "W_char_cpu.bson"))[:W_char_cpu] + W_word_Embed = BSON.load(joinpath(weights_path, "W_word_cpu.bson"))[:W_word_cpu][:, 1:end-1] # no padding char token here + W_Char_Embed = BSON.load(joinpath(weights_path, "W_char_cpu.bson"))[:W_char_cpu][:, 1:end-1] # no padding word token here # Forward_LSTM forward_wts = BSON.load(joinpath(weights_path, "forward_lstm.bson")) forward_lstm = Flux.Recur(Flux.LSTMCell(forward_wts[:lstm_2], # Wi forward_wts[:lstm_1], # Wh forward_wts[:lstm_3], # b - forward_wts[:lstm_4], # h - forward_wts[:lstm_5] # c + (reshape(forward_wts[:lstm_4], length(forward_wts[:lstm_4]), 1), # h + reshape(forward_wts[:lstm_5], length(forward_wts[:lstm_5]), 1)) # c ), - forward_wts[:lstm_init], - forward_wts[:lstm_state] - ) + (reshape(forward_wts[:lstm_state][1], length(forward_wts[:lstm_state][1]), 1), # h + reshape(forward_wts[:lstm_state][2], length(forward_wts[:lstm_state][2]), 1)) + ) # Backward_LSTM backward_wts = BSON.load(joinpath(weights_path, "backward_lstm.bson")) backward = Flux.Recur(Flux.LSTMCell(backward_wts[:lstm_2], # Wi backward_wts[:lstm_1], # Wh backward_wts[:lstm_3], # b - backward_wts[:lstm_4], # h - backward_wts[:lstm_5] # c - ), - backward_wts[:lstm_init], - backward_wts[:lstm_state] - ) + (reshape(backward_wts[:lstm_4], length(backward_wts[:lstm_4]), 1), # h + reshape(backward_wts[:lstm_5], length(backward_wts[:lstm_5]), 1)) # c + ), + (reshape(backward_wts[:lstm_state][1], length(backward_wts[:lstm_state][1]), 1), # h + reshape(backward_wts[:lstm_state][2], length(backward_wts[:lstm_state][2]), 1)) + ) # Dense d_weights_bias = BSON.load(joinpath(weights_path, "d_cpu.bson")) @@ -69,7 +69,7 @@ function BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, UNK_char_idx,UNK_Wor # Load CRF. 
crf_wt = BSON.load(joinpath(weights_path, "crf_cpu.bson"))[:crf_Weights] - c = TextModels.CRF(crf_wt, size(crf_wt)[1] - 2) + c = CRF(crf_wt, size(crf_wt)[1] - 2) # Load Conv conv_wt_bias = BSON.load(joinpath(weights_path, "conv_cpu.bson")) @@ -79,6 +79,8 @@ function BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, UNK_char_idx,UNK_Wor (1, 1), # stride (0, 2), # pad (1, 1), # dilation + 1 # groups + ) BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, conv1, W_Char_Embed, W_word_Embed, @@ -100,7 +102,7 @@ function (a::BiLSTM_CNN_CRF_Model)(x) oh_outs = viterbi_decode(a.c, m(x), a.init_α) Flux.reset!(a.backward) Flux.reset!(a.forward_lstm) - [a.labels[oh.ix] for oh in oh_outs] + [a.labels[oh.indices] for oh in oh_outs] end onehotinput(m::BiLSTM_CNN_CRF_Model, word) = (onehot(get(m.words_idx, lowercase(word), m.UNK_Word_idx), 1:length(m.words_idx)), diff --git a/test/crf.jl b/test/crf.jl index 34237d2..a548a4b 100644 --- a/test/crf.jl +++ b/test/crf.jl @@ -1,5 +1,5 @@ using Flux -using Flux: gradient, LSTM, Dense, reset!, onehot, RNN +using Flux: gradient, LSTM, Dense, reset!, onehot, RNN, params using TextModels: score_sequence, forward_score @testset "crf" begin @@ -108,7 +108,7 @@ using TextModels: score_sequence, forward_score init_α = fill(-10000, (c.n + 2, 1)) init_α[c.n + 1] = 0 - loss(xs, ys) = crf_loss(c, m(xs), ys, init_α) + loss(xs, ys) = crf_loss(c, m(xs), ys, init_α) + 1e-4*sum(c.W.*c.W) opt = Descent(0.01) data = zip(X, Y) @@ -117,29 +117,29 @@ using TextModels: score_sequence, forward_score function train() for d in data - reset!(lstm) - grads = Tracker.gradient(() -> loss(d[1], d[2]), ps) + Flux.reset!(lstm) + grads = gradient(() -> loss(d[1], d[2]), ps) Flux.Optimise.update!(opt, ps, grads) end end function find_loss(d) - reset!(lstm) + Flux.reset!(lstm) loss(d[1], d[2]) end to_sum = [find_loss(d) for d in data] l1 = sum(to_sum) - dense_param_1 = deepcopy(Tracker.data(d_out.W)) - lstm_param_1 = deepcopy(Tracker.data(lstm.cell.Wh)) - crf_param_1 = deepcopy(Tracker.data(c.W)) + dense_param_1 = deepcopy(d_out.W) + lstm_param_1 = deepcopy(lstm.cell.Wh) + crf_param_1 = deepcopy(c.W) for i in 1:10 train() end - dense_param_2 = deepcopy(Tracker.data(d_out.W)) - lstm_param_2 = deepcopy(Tracker.data(lstm.cell.Wh)) - crf_param_2 = deepcopy(Tracker.data(c.W)) + dense_param_2 = deepcopy(d_out.W) + lstm_param_2 = deepcopy(lstm.cell.Wh) + crf_param_2 = deepcopy(c.W) l2 = sum([find_loss(d) for d in data]) @test l1 > l2 @@ -148,3 +148,4 @@ using TextModels: score_sequence, forward_score @test crf_param_1 != crf_param_2 end end + diff --git a/test/runtests.jl b/test/runtests.jl index 1bcac94..2738bfa 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -7,6 +7,6 @@ println("Running tests:") include("crf.jl") include("ner.jl") include("pos.jl") +include("sentiment.jl") include("averagePerceptronTagger.jl") include("ulmfit.jl") -include("sentiment.jl") diff --git a/test/ulmfit.jl b/test/ulmfit.jl index 8ea0092..3deca62 100644 --- a/test/ulmfit.jl +++ b/test/ulmfit.jl @@ -4,7 +4,7 @@ using BSON @testset "Custom layers" begin @testset "WeightDroppedLSTM" begin wd = ULMFiT.WeightDroppedLSTM(4, 5, 0.3) - @test all(wd.init .== wd.state) + @test all((wd.cell.h, wd.cell.c) .== wd.state) @test size(wd.cell.Wi) == size(wd.cell.maskWi) @test size(wd.cell.Wh) == size(wd.cell.maskWh) @test wd.cell.active @@ -31,10 +31,10 @@ using BSON ULMFiT.asgd_step!(4, awd) @test length(awd.accum) == 3 temp = deepcopy(awd.accum[1][1]) - @test temp == Tracker.data(awd.layer.cell.Wi[1]) + @test temp == 
awd.layer.cell.Wi[1] ULMFiT.asgd_step!(5, awd) temp += temp - @test temp == Tracker.data(awd.accum[1][1]) + @test temp == awd.accum[1][1] @test length(params(awd)) == 5 end @@ -95,6 +95,12 @@ end @test length(ULMFiT.get_trainable_params(lm.layers)) == 10 pretrained_weights = BSON.load(datadep"Pretrained ULMFiT Language Model/ulmfit_lm_en.bson") + # reshape weights of (h, c) + layers = [5, 6, 10, 11, 15, 16] + for i in layers + pretrained_weights[:weights][i] = reshape(pretrained_weights[:weights][i], length(pretrained_weights[:weights][i]), 1) + end + @test length(pretrained_weights[:weights]) == 16 @test all(size.(params(lm)) .== size.(pretrained_weights[:weights])) end