diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..8e04faf --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,64 @@ +name: CI +on: + push: + branches: + - master + tags: '*' + pull_request: +jobs: + test: + name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} + runs-on: ${{ matrix.os }} + continue-on-error: ${{ matrix.version == 'nightly' }} + strategy: + matrix: + version: + - '1.6' + - 'nightly' + os: + - ubuntu-latest + - macOS-latest + - windows-latest + arch: + - x86 + - x64 + exclude: + # Remove some configurations from the build matrix to reduce CI time. + # See https://github.com/marketplace/actions/setup-julia-environment + # MacOS not available on x86 + - {os: 'macOS-latest', arch: 'x86'} + # Don't test on all versions + - {os: 'macOS-latest', version: '1.6'} + - {os: 'macOS-latest', version: 'nightly'} + - {os: 'windows-latest', version: '1.6'} + - {os: 'windows-latest', version: 'nightly'} + - {os: 'windows-latest', arch: 'x86'} + - {arch: 'x86', version: '1.6'} + - {arch: 'x86', version: 'nightly'} + steps: + - uses: actions/checkout@v1 + - uses: julia-actions/setup-julia@latest + with: + version: ${{ matrix.version }} + arch: ${{ matrix.arch }} + - uses: julia-actions/julia-buildpkg@latest + - uses: julia-actions/julia-runtest@latest + env: + DATADEPS_ALWAYS_ACCEPT: true + with: + coverage: false + docs: + name: Documentation + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v1 + - uses: julia-actions/setup-julia@latest + with: + version: '1.6' + - run: julia --project=docs -e ' + using Pkg; + Pkg.develop(PackageSpec(; path=pwd())); + Pkg.instantiate();' + - run: julia --project=docs docs/make.jl + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.travis.yml b/.travis.yml index bf028c9..8e8320a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,17 +6,16 @@ os: env: - DATADEPS_ALWAYS_ACCEPT=true julia: - - 1.3 - - 1 + - 1.6 - nightly matrix: allow_failures: - julia: nightly exclude: - os: osx - julia: 1.3 + julia: 1.6 - os: windows - julia: 1.3 + julia: 1.6 - os: osx julia: nightly - os: windows diff --git a/Project.toml b/Project.toml index 4687488..bbe2f84 100644 --- a/Project.toml +++ b/Project.toml @@ -2,40 +2,42 @@ name = "TextModels" uuid = "77b9cbda-2a23-51df-82a3-24144d1cd378" license = "MIT" desc = "Practical Neural Network based models for Natural Language Processing" -version = "0.1.0" +version = "0.1.1" [deps] BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +CorpusLoaders = "214a0ac2-f95b-54f7-a80b-442ed9c2c9e8" DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe" DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab" Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" Languages = "8ef0a80b-9436-5d2c-a485-80b904378c43" -LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" -Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" -Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" TextAnalysis = "a2db99b7-8b79-58f8-94bf-bbc811eef33d" -Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c" WordTokenizers = "796a5d58-b03d-544a-977e-18100b691f6e" +Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [compat] -BSON = 
"0.2.5" +BSON = "0.3.3" +CUDA = "3" +CorpusLoaders = "0.3" DataDeps = "0.7" -DataStructures = "0.17, 0.18" -Flux = "0.9" -JSON = "0.21" -Languages = "0.4" -NNlib = "0.6, 0.7" -StatsBase = "0.33" -TextAnalysis = "0.7" -Tracker = "0.2" -WordTokenizers = "0.5" -julia = "1.3" +DataStructures = "0.18.9" +Flux = "0.12.8" +JSON = "0.21.1" +Languages = "0.4.3" +NNlib = "0.7" +StatsBase = "0.33.6" +TextAnalysis = "0.7.3" +WordTokenizers = "0.5.6" +Zygote = "0.6.10" +julia = "1.6" [extras] Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/README.md b/README.md index afa0f57..4b63f6e 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ A Julia package for working with text. ## Introduction -The TextModels package enhances the TextAnalysis package with end-user focussed, practical natural language models, typically based on neural networks (in this case, [Flux](https://fluxml.ai/)). Please see the [documentation](https://juliatext.github.io/TextAnalysis.jl/latest) for more. +The TextModels package enhances the TextAnalysis package with end-user focussed, practical natural language models, typically based on neural networks (in this case, [Flux](https://fluxml.ai/)). Please see the [documentation](https://juliahub.com/docs/TextModels) for more. - **License** : [MIT License](https://github.com/JuliaText/TextAnalysis.jl/blob/master/LICENSE.md) diff --git a/docs/make.jl b/docs/make.jl index 5876f79..ca1f9ea 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -2,14 +2,16 @@ using Documenter, TextModels makedocs( modules = [TextModels], - sitename = "TextAnalysis", + sitename = "TextModels", format = Documenter.HTML( ), pages = [ "Home" => "index.md", "Conditional Random Fields" => "crf.md", - "Named Entity Recognition" => "ner.md", "ULMFiT" => "ULMFiT.md", + "Named Entity Recognition" => "ner.md", + "Tagging Schemes" => "tagging.md", + "Sentiment Analyzer" => "sentiment.md", "API References" => "APIReference.md" ], ) diff --git a/docs/src/ULMFiT.md b/docs/src/ULMFiT.md index 332e2fd..89622d1 100644 --- a/docs/src/ULMFiT.md +++ b/docs/src/ULMFiT.md @@ -18,37 +18,38 @@ Default data loaders are provided in the `data_loaders.jl`: In this step, Language Model will learn the general properties of the Language. To train the model we need a general domain corpus like WikiText-103. For training, a `generator` function is provided to create a `Channel` which will give mini-batch in every call. 
After pre-processing the corpus, the tokenized corpus is given as input to the generator function and the Channel can be created like so: ```julia -julia> loader = Channel(x -> generator(x, corpus; batchsize=4, bptt=10)) -Channel{Any}(sz_max:0,sz_curr:1) +julia> loader = ULMFiT.imdb_fine_tune_data(4, 10) # batchsize=4, bptt=10 +Channel{Any}(0) (1 item available) julia> max_batches = take!(loader) # this is the first call to the loader # These are the subsequent calls in pairs for X and Y -julia> X = take!(Loaders) - 10-element Array{Array{Any,1},1}: - ["senjō", ",", "indicated", "after"] - ["no", "he", ",", "two"] - ["valkyria", "sent", "\"", "games"] - ["3", "a", "i", ","] - [":", "formal", "am", "making"] - ["", "demand", "to", "a"] - ["chronicles", "for", "some", "start"] - ["(", "surrender", "extent", "against"] - ["japanese", "of", "influenced", "the"] - [":", "the", "by", "vancouver"] - -julia> Y = take!(gen) -10-element Array{Array{Any,1},1}: -["no", "he", ",", "two"] -["valkyria", "sent", "\"", "games"] -["3", "a", "i", ","] -[":", "formal", "am", "making"] -["", "demand", "to", "a"] -["chronicles", "for", "some", "start"] -["(", "surrender", "extent", "against"] -["japanese", "of", "influenced", "the"] -[":", "the", "by", "vancouver"] -["戦場のヴァルキュリア", "arsenal", "them", "canucks"] +julia> X = take!(loader) +10-element Vector{Vector{Any}}: + ["i", "transparent", "it", "were"] + ["admit", "villain", "immediately", "all"] + [",", "who", "as", "first"] + ["the", "talks", "she", "rate"] + ["great", "like", "is", "."] + ["majority", "mortimer", "on", "even"] + ["of", "snerd", "for", "veda"] + ["films", "and", "a", "ann"] + ["released", "has", "few", "borg"] + ["before", "an", "seconds", "in"] + +julia> Y = take!(loader) +10-element Vector{Vector{Any}}: + ["admit", "villain", "immediately", "all"] + [",", "who", "as", "first"] + ["the", "talks", "she", "rate"] + ["great", "like", "is", "."] + ["majority", "mortimer", "on", "even"] + ["of", "snerd", "for", "veda"] + ["films", "and", "a", "ann"] + ["released", "has", "few", "borg"] + ["before", "an", "seconds", "in"] + ["say", "office", ",", "a"] + ``` Note that at the first call to this `Channel` the output will be maximum number of batches which it can give. Two calls to this `Channel` completed one batch, that is, it doesnot give `X` and `Y` both together in one call, two calls are needed, one first `X` is given out and in second `Y`. Also, to understand what are `batchsize` and `bptt`, refer this [blog](https://nextjournal.com/ComputerMaestro/jsoc19-practical-implementation-of-ulmfit-in-julia-2). @@ -199,24 +200,24 @@ This is basically a modification to the original LSTM layer. The layer uses [Dro ```julia # maskWi and maskWh are drop masks for Wi and Wh weights -julia> fieldnames(WeightDroppedLSTMCell) +julia> fieldnames(ULMFiT.WeightDroppedLSTMCell) (:Wi, :Wh, :b, :h, :c, :p, :maskWi, :maskWh, :active) # To deine a layer with 4 input size and 5 output size and 0.3 dropping probability -julia> wd = WeightDroppedLSTM(4, 5, 0.3); +julia> wd = ULMFiT.WeightDroppedLSTM(4, 5, 0.3); # Pass julia> x = rand(4); julia> h = wd(x) -Tracked 5-element Array{Float64,1}: - 0.06149460838123775 - -0.06028818475111407 - 0.07400426274491535 - -0.20671647527394219 - -0.00678279380721769 +5×1 Matrix{Float64}: + 0.17602923394922002 + 0.08615001440875035 + 0.015924513976372016 + 0.10526862977034518 + -0.04417581280319146 # To reset_masks! 
-julia> reset_masks!(wd) +julia> ULMFiT.reset_masks!(wd) ``` ### Averaged-SGD LSTM (AWD_LSTM) @@ -226,63 +227,63 @@ This is a regular LSTM layer with Variational DropConnect and weights averaging ```julia # `accum` field is used to store the sum of weights for every iteration after trigger # to get average of the weights for every subsequent iteration -julia> fieldnames(AWD_LSTM) +julia> fieldnames(ULMFiT.AWD_LSTM) (:layer, :T, :accum) -julia> awd = AWD_LSTM(3, 4, 0.5) +julia> awd = ULMFiT.AWD_LSTM(3, 4, 0.5) # Setting trigger iteration -julia> set_trigger!(1000, awd) +julia> ULMFiT.set_trigger!(1000, awd) julia> awd.T 1000 # Pass -julia> x = rand(3) +julia> x = rand(3); julia> h = awd(x) -Tracked 4-element Array{Float64,1}: - -0.0751824486756288 - -0.3061227967356536 - -0.030079860137667995 - -0.09833401074779546 +4×1 Matrix{Float64}: + 0.15229648590284084 + -0.05929450272853615 + -0.06110043118692251 + 0.15302430271141032 # Resetting drop masks - julia> awd.layer.cell.maskWi - 16×3 Array{Float32,2}: - 0.0 2.0 2.0 - 2.0 2.0 2.0 +julia> awd.layer.cell.maskWi +16×3 Matrix{Float32}: + 0.0 0.0 0.0 + 2.0 0.0 0.0 0.0 2.0 0.0 - 0.0 0.0 2.0 - 0.0 0.0 2.0 - 2.0 2.0 2.0 + 0.0 0.0 0.0 2.0 2.0 2.0 - 0.0 2.0 2.0 0.0 2.0 0.0 2.0 0.0 2.0 + 2.0 2.0 2.0 + 2.0 0.0 0.0 0.0 0.0 2.0 - 0.0 2.0 2.0 + 2.0 0.0 0.0 2.0 0.0 2.0 0.0 2.0 0.0 0.0 2.0 0.0 - 2.0 0.0 2.0 + 2.0 2.0 2.0 + 2.0 2.0 2.0 - julia> reset_masks!(awd) - julia> awd.layer.cell.maskWi - 16×3 Array{Float32,2}: +julia> ULMFiT.reset_masks!(awd) +julia> awd.layer.cell.maskWi +16×3 Matrix{Float32}: 0.0 2.0 0.0 - 0.0 0.0 0.0 - 2.0 0.0 0.0 0.0 2.0 0.0 + 0.0 0.0 0.0 2.0 2.0 0.0 2.0 2.0 2.0 - 2.0 2.0 0.0 - 2.0 2.0 0.0 2.0 2.0 2.0 + 0.0 2.0 0.0 + 2.0 2.0 0.0 + 2.0 0.0 2.0 0.0 0.0 2.0 2.0 0.0 0.0 2.0 2.0 2.0 - 2.0 2.0 2.0 0.0 0.0 2.0 - 0.0 2.0 0.0 + 0.0 2.0 2.0 + 2.0 0.0 2.0 0.0 0.0 2.0 ``` @@ -291,33 +292,34 @@ Tracked 4-element Array{Float64,1}: This layer applis Variational-DropOut, which is, using same dropout mask till it is not specified to change or till a pass is over. This dropout is useful for recurrent layers since these layers perform better if same mask is used for all time-steps (pass) instead of using different for every timestep. [Refer [this](https://arxiv.org/pdf/1506.02557.pdf) paper for more details]. This layer saves the masks after generation till it is not specified to change. To change the mask use `reset_masks!` function. 
```julia -julia> vd = VarDrop(0.5) -VarDrop{Float64}(0.5, Array{Float32}(0,0), true, true) +julia> vd = ULMFiT.VarDrop(0.5) +VarDrop{Float64}(0.5, Matrix{Float32}(undef, 0, 0), true, true) # No mask generation will nothing is passed julia> vd.mask -0×0 Array{Float32,2} +0×0 Matrix{Float32} julia> x = rand(4,5) -4×5 Array{Float64,2}: - 0.480531 0.556341 0.228134 0.439411 0.137296 - 0.541459 0.118603 0.448941 0.568478 0.0440091 - 0.491735 0.55232 0.857768 0.729287 0.842753 - 0.33523 0.0378036 0.491757 0.00710462 0.374096 - - julia> x = vd(x) - 4×5 Array{Float64,2}: - 0.961062 1.11268 0.0 0.0 0.274592 - 1.08292 0.0 0.897881 0.0 0.0880182 - 0.98347 0.0 0.0 1.45857 1.68551 - 0.67046 0.0756071 0.983514 0.0142092 0.0 - - julia> vd.mask - 4×5 Array{Float64,2}: - 2.0 2.0 0.0 0.0 2.0 - 2.0 0.0 2.0 0.0 2.0 - 2.0 0.0 0.0 2.0 2.0 - 2.0 2.0 2.0 2.0 0.0 +4×5 Matrix{Float64}: + 0.383492 0.914917 0.616324 0.940116 0.526015 + 0.286494 0.35078 0.320465 0.334261 0.295965 + 0.232206 0.26289 0.940569 0.23259 0.675406 + 0.152903 0.934304 0.125803 0.727792 0.239359 + +julia> x = vd(x) +4×5 Matrix{Float64}: + 0.0 0.0 0.0 1.88023 1.05203 + 0.0 0.0 0.64093 0.668522 0.591929 + 0.464413 0.0 1.88114 0.0 0.0 + 0.0 0.0 0.0 0.0 0.478717 + +julia> vd.mask +4×5 Matrix{Float64}: + 0.0 0.0 0.0 2.0 2.0 + 0.0 0.0 2.0 2.0 2.0 + 2.0 0.0 2.0 0.0 0.0 + 0.0 0.0 0.0 0.0 2.0 + ``` ### Dropped Embeddings (DroppedEmbeddings) @@ -325,35 +327,35 @@ julia> x = rand(4,5) This layer is an embedding layer which can work in two ways either to give embeddings Vectors for the given indices of words in vocabulary or can be used to get probability distribution for all the words of vocabulary with softmax layer, which is also called as weight-tying. Here, it can be used to tie weights of the embedding layer and the last softmax layer. In addition to this, it also dropped embeddings for words randomly for given probability of dropping, in other words, it puts whole embedding vector of randomly selects to vector of zeros. Here, the mask used for the dropping posses variational property, that is, it cannot be changed till it is not specified to change or generate a new drop mask. `reset_masks!` should be used to reset the mask. ```julia -julia> fieldnames(DroppedEmbeddings) +julia> fieldnames(ULMFiT.DroppedEmbeddings) (:emb, :p, :mask, :active) -julia> de = DroppedEmbeddings(5, 2, 0.3) +julia> de = ULMFiT.DroppedEmbeddings(5, 2, 0.3); # Pass -julia> x = [4,2,1] +julia> x = [4,2,1]; julia> embeddings = de(x) -Tracked 2×3 LinearAlgebra.Transpose{Float32,Array{Float32,2}}: - 0.86327 0.537614 -0.0 - 0.152131 -0.541008 -0.0 +2×3 transpose(::Matrix{Float32}) with eltype Float32: + 0.363157 -0.0246867 -0.332342 + -0.553211 -0.594884 0.184288 - julia> de.mask - 5-element Array{Float32,1}: - 0.0 +julia> de.mask +5-element Vector{Float32}: + 1.4285715 1.4285715 1.4285715 1.4285715 1.4285715 - # reset mask - julia> reset_masks!(de) - julia> de.mask - 5-element Array{Float32,1}: - 0.0 +# reset mask +julia> reset_masks!(de) +julia> de.mask +5-element Vector{Float32}: 1.4285715 1.4285715 - 0.0 1.4285715 + 0.0 + 0.0 ``` ### Concat-Pooled Dense layer @@ -362,13 +364,13 @@ This is a simple modification to the original `Dense` layer for recurrent networ ```julia # The first argument is the length of the output Vector of the preceding RNN layer to this layer. 
Also, by default if uses identity activation, it can be changed by giving desired activaiton as the third argument -julia> pd = PooledDense(4, 3) +julia> pd = ULMFiT.PooledDense(4, 3) # Pass -julia> X = [rand(4), rand(4), rand(4)] +julia> X = [rand(4), rand(4), rand(4)]; julia> pd(X) -Tracked 3×1 Array{Float64,2}: - -2.2106991143006036 - -0.9560163708455404 - -0.4770649645417375 +3×1 Matrix{Float64}: + -1.3679283360573462 + 1.1115990254044759 + -0.27398355913859046 ``` diff --git a/docs/src/crf.md b/docs/src/crf.md index 19f958d..af93cbf 100644 --- a/docs/src/crf.md +++ b/docs/src/crf.md @@ -6,18 +6,19 @@ Let us first load the dependencies- using Flux using Flux: onehot, train!, Params, gradient, LSTM, Dense, reset! - using TextAnalysis: CRF, viterbi_decode, crf_loss + using TextModels: CRF, viterbi_decode, crf_loss Conditional Random Field layer is essentially like a softmax that operates on the top most layer. Let us suppose the following input sequence to the CRF with `NUM_LABELS = 2` ```julia +julia> NUM_LABELS = 2 julia> SEQUENCE_LENGTH = 2 # CRFs can handle variable length inputs sequences -julia> input_seq = [rand(NUM_LABELS + 2) for i in 1:SEQUENCE_LENGTH] # NUM_LABELS + 2, where two extra features correspond to the :START and :END label. -2-element Array{Array{Float64,1},1}: - [0.523462, 0.455434, 0.274347, 0.755279] - [0.610991, 0.315381, 0.0863632, 0.693031] +julia> input_seq = [Float32.(rand(NUM_LABELS + 2)) for i in 1:SEQUENCE_LENGTH] # NUM_LABELS + 2, where two extra features correspond to the :START and :END label. +2-element Vector{Vector{Float32}}: + [0.5114323, 0.5355139, 0.4011792, 0.56359255] + [0.22925346, 0.21232551, 0.77616125, 0.41560093] ``` @@ -56,16 +57,16 @@ julia> label_seq3 = [onehot(2, 1:2), onehot(1, 1:2)] julia> label_seq4 = [onehot(2, 1:2), onehot(2, 1:2)] julia> crf_loss(c, input_seq, label_seq1, init_α) -1.9206894963901504 (tracked) +1.33554f0 julia> crf_loss(c, input_seq, label_seq2, init_α) -1.4972745472075206 (tracked) +1.2327178f0 julia> crf_loss(c, input_seq, label_seq3, init_α) -1.543210471592448 (tracked) +1.3454239f0 julia> crf_loss(c, input_seq, label_seq4, init_α) -0.876923329893466 (tracked) +1.6871009f0 ``` @@ -75,9 +76,9 @@ We can decode this using Viterbi Decode. 
```julia
julia> viterbi_decode(c, input_seq, init_α) # Gives the label_sequence with least loss
-2-element Array{Flux.OneHotVector,1}:
- [false, true]
- [false, true]
+2-element Vector{Flux.OneHotArray{UInt32, 2, 0, 1, UInt32}}:
+ [1, 0]
+ [0, 1]
```
@@ -96,7 +97,7 @@ CRFs smoothly work over Flux layers-
julia> NUM_FEATURES = 20

julia> input_seq = [rand(NUM_FEATURES) for i in 1:SEQUENCE_LENGTH]
-2-element Array{Array{Float64,1},1}:
+2-element Vector{Vector{Float64}}:
 [0.948219, 0.719964, 0.352734, 0.0677656, 0.570564, 0.187673, 0.525125, 0.787807, 0.262452, 0.472472, 0.573259, 0.643369, 0.00592054, 0.945258, 0.951466, 0.323156, 0.679573, 0.663285, 0.218595, 0.152846]
 [0.433295, 0.11998, 0.99615, 0.530107, 0.188887, 0.897213, 0.993726, 0.0799431, 0.953333, 0.941808, 0.982638, 0.0919345, 0.27504, 0.894169, 0.66818, 0.449537, 0.93063, 0.384957, 0.415114, 0.212203]

julia> m1 = Dense(NUM_FEATURES, NUM_LABELS + 2)

julia> loss1(input_seq, label_seq) = crf_loss(c, m1.(input_seq), label_seq, init_α) # loss for model m1

julia> loss1(input_seq, [onehot(1, 1:2), onehot(1, 1:2)])
-4.6620379898687485 (tracked)
+4.6620379898687485
```
@@ -124,7 +125,7 @@ julia> m2(x) = dense_out.(lstm.(x))
julia> loss2(input_seq, label_seq) = crf_loss(c, m2(input_seq), label_seq, init_α) # loss for model m2

julia> loss2(input_seq, [onehot(1, 1:2), onehot(1, 1:2)])
-1.6501050910529504 (tracked)
+1.6501050910529504

julia> reset!(lstm)
```
diff --git a/docs/src/index.md b/docs/src/index.md
index 2168e45..8c36217 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -2,6 +2,8 @@

The TextModels package enhances the TextAnalysis package with end-user focussed, practical natural language models, typically based on neural networks (in this case, [Flux](https://fluxml.ai/))

+This package depends on the [TextAnalysis](https://github.com/JuliaText/TextAnalysis.jl) package, which contains basic algorithms to deal with textual documents.
+
## Installation

The TextModels package can be installed using Julia's package manager:
diff --git a/docs/src/sentiment.md b/docs/src/sentiment.md
new file mode 100644
index 0000000..e2cfe57
--- /dev/null
+++ b/docs/src/sentiment.md
@@ -0,0 +1,41 @@
+## Sentiment Analyzer
+
+The sentiment analyzer can be used to find the sentiment score (between 0 and 1) of a word, a sentence, or a document.
+A Flux model trained on the IMDB word corpus, with saved weights, is used to calculate the sentiment.
+
+    model = SentimentAnalyzer()
+    model(doc)
+    model(doc, handle_unknown)
+
+* doc = input document for which to calculate the sentiment (AbstractDocument type)
+* handle_unknown = A function for handling unknown words.
+Should return an array (default: `(x)->[]`).
+
+```julia
+julia> using TextAnalysis, TextModels
+
+julia> m = SentimentAnalyzer()
+Sentiment Analysis Model Trained on IMDB with a 88587 word corpus
+
+julia> d1 = StringDocument("a very nice thing that everyone likes")
+A StringDocument{String}
+ * Language: Languages.English()
+ * Title: Untitled Document
+ * Author: Unknown Author
+ * Timestamp: Unknown Time
+ * Snippet: a very nice thing that everyone likes
+
+julia> m(d1)
+0.5183109f0
+
+julia> d2 = StringDocument("a horrible thing that everyone hates")
+A StringDocument{String}
+ * Language: Languages.English()
+ * Title: Untitled Document
+ * Author: Unknown Author
+ * Timestamp: Unknown Time
+ * Snippet: a horrible thing that everyone hates
+
+julia> m(d2)
+0.47193584f0
+
+```
diff --git a/docs/src/tagging.md b/docs/src/tagging.md
new file mode 100644
index 0000000..90d85cf
--- /dev/null
+++ b/docs/src/tagging.md
@@ -0,0 +1,237 @@
+## Tagging Schemes
+
+There are many tagging schemes used for sequence labelling.
+TextAnalysis currently offers functions for converting between these tagging formats:
+
+* BIO1
+* BIO2
+* BIOES
+
+```julia
+julia> tags = ["I-LOC", "O", "I-PER", "B-MISC", "I-MISC", "B-PER", "I-PER", "I-PER"]
+
+julia> tag_scheme!(tags, "BIO1", "BIOES")
+
+julia> tags
+8-element Array{String,1}:
+ "S-LOC"
+ "O"
+ "S-PER"
+ "B-MISC"
+ "E-MISC"
+ "B-PER"
+ "I-PER"
+ "E-PER"
+```
+
+## Parts of Speech Tagging
+
+This package provides two different Part of Speech taggers.
+
+## Average Perceptron Part of Speech Tagger
+
+This tagger can be used to find the POS tag of a word or token in a given sentence. It is based on the `Average Perceptron Algorithm`.
+The model can be trained from scratch, and the weights are saved at a specified location.
+The pretrained model can also be loaded and used directly to predict tags.
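+
+As a quick orientation, here is a minimal usage sketch that simply combines the loading and prediction calls documented in the sections below (the exact tags returned depend on the pretrained weights):
+
+```julia
+julia> tagger = PerceptronTagger(true)          # load the pretrained model
+
+julia> tagger(["today", "is", "good", "day"])   # returns a Vector of (token, tag) pairs
+```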
+
+### To train model:
+```julia
+julia> tagger = PerceptronTagger(false) # we can use tagger = PerceptronTagger()
+julia> fit!(tagger, [[("today","NN"),("is","VBZ"),("good","JJ"),("day","NN")]])
+iteration : 1
+iteration : 2
+iteration : 3
+iteration : 4
+iteration : 5
+```
+
+### To load pretrained model:
+```julia
+julia> tagger = PerceptronTagger(true)
+loaded successfully
+PerceptronTagger(AveragePerceptron(Set(Any["JJS", "NNP_VBZ", "NN_NNS", "CC", "NNP_NNS", "EX", "NNP_TO", "VBD_DT", "LS", ("Council", "NNP") … "NNPS", "NNP_LS", "VB", "NNS_NN", "NNP_SYM", "VBZ", "VBZ_JJ", "UH", "SYM", "NNP_NN", "CD"]), Dict{Any,Any}("i+2 word wetlands"=>Dict{Any,Any}("NNS"=>0.0,"JJ"=>0.0,"NN"=>0.0),"i-1 tag+i word NNP basic"=>Dict{Any,Any}("JJ"=>0.0,"IN"=>0.0),"i-1 tag+i word DT chloride"=>Dict{Any,Any}("JJ"=>0.0,"NN"=>0.0),"i-1 tag+i word NN choo"=>Dict{Any,Any}("NNP"=>0.0,"NN"=>0.0),"i+1 word antarctica"=>Dict{Any,Any}("FW"=>0.0,"NN"=>0.0),"i-1 tag+i word -START- appendix"=>Dict{Any,Any}("NNP"=>0.0,"NNPS"=>0.0,"NN"=>0.0),"i-1 word wahoo"=>Dict{Any,Any}("JJ"=>0.0,"VBD"=>0.0),"i-1 tag+i word DT children's"=>Dict{Any,Any}("NNS"=>0.0,"NN"=>0.0),"i word dnipropetrovsk"=>Dict{Any,Any}("NNP"=>0.003,"NN"=>-0.003),"i suffix hla"=>Dict{Any,Any}("JJ"=>0.0,"NN"=>0.0)…), DefaultDict{Any,Any,Int64}(), DefaultDict{Any,Any,Int64}(), 1, ["-START-", "-START2-"]), Dict{Any,Any}("is"=>"VBZ","at"=>"IN","a"=>"DT","and"=>"CC","for"=>"IN","by"=>"IN","Retrieved"=>"VBN","was"=>"VBD","He"=>"PRP","in"=>"IN"…), Set(Any["JJS", "NNP_VBZ", "NN_NNS", "CC", "NNP_NNS", "EX", "NNP_TO", "VBD_DT", "LS", ("Council", "NNP") … "NNPS", "NNP_LS", "VB", "NNS_NN", "NNP_SYM", "VBZ", "VBZ_JJ", "UH", "SYM", "NNP_NN", "CD"]), ["-START-", "-START2-"], ["-END-", "-END2-"], Any[])
+```
+
+### To predict tags:
+
+The perceptron tagger can predict tags over various document types:
+
+    predict(tagger, sentence::String)
+    predict(tagger, Tokens::Array{String, 1})
+    predict(tagger, sd::StringDocument)
+    predict(tagger, fd::FileDocument)
+    predict(tagger, td::TokenDocument)
+
+This can also be done by calling the tagger directly:
+
+    tagger(input)
+
+```julia
+julia> predict(tagger, ["today", "is"])
+2-element Array{Any,1}:
+ ("today", "NN")
+ ("is", "VBZ")
+
+julia> tagger(["today", "is"])
+2-element Array{Any,1}:
+ ("today", "NN")
+ ("is", "VBZ")
+```
+
+`PerceptronTagger(load::Bool)`
+
+* load = Boolean argument; if `true`, the pretrained model is loaded
+
+`fit!(self::PerceptronTagger, sentences::Vector{Vector{Tuple{String, String}}}, save_loc::String, nr_iter::Integer)`
+
+* self = `PerceptronTagger` object
+* sentences = `Vector` of `Vector`s of `Tuple`s, each pairing a word or token with its POS tag [see the example above]
+* save_loc = location of the file in which to save the trained weights
+* nr_iter = number of iterations for which to pass the `sentences` to train the model (default 5)
+
+`predict(self::PerceptronTagger, tokens)`
+
+* self = PerceptronTagger
+* tokens = `Vector` of words or tokens for which to predict tags
+
+## Neural Model for Part of Speech tagging using LSTMs, CNN and CRF
+
+The API provides a pretrained model for Part of Speech tagging.
+Tagging follows the [convention used in the Penn Treebank](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html), with 36 different Part of Speech tags, excluding punctuation.
+
+To use the API, we first load the model weights into an instance of the tagger.
+The function also accepts the paths of the model_weights and model_dicts (for character and word embeddings).
+
+    PoSTagger()
+    PoSTagger(dicts_path, weights_path)
+
+```julia
+julia> pos = PoSTagger()
+
+```
+
+!!! note
+    When you call `PoSTagger()` for the first time, the package will request permission to download the `Model_dicts` and `Model_weights`. Once downloaded, these are stored locally and managed by `DataDeps`, so on subsequent uses the weights will not need to be downloaded again.
+
+Once we create an instance, we can call it to tag a String (sentence), a sequence of tokens, an `AbstractDocument`, or a `Corpus`.
+
+    (pos::PoSTagger)(sentence::String)
+    (pos::PoSTagger)(tokens::Array{String, 1})
+    (pos::PoSTagger)(sd::StringDocument)
+    (pos::PoSTagger)(fd::FileDocument)
+    (pos::PoSTagger)(td::TokenDocument)
+    (pos::PoSTagger)(crps::Corpus)
+
+```julia
+
+julia> sentence = "This package is maintained by John Doe."
+"This package is maintained by John Doe."
+
+julia> tags = pos(sentence)
+8-element Array{String,1}:
+ "DT"
+ "NN"
+ "VBZ"
+ "VBN"
+ "IN"
+ "NNP"
+ "NNP"
+ "."
+
+```
+
+The API tokenizes the input sentences via the default tokenizer provided by `WordTokenizers`, which is currently set to the multilingual `TokTok` tokenizer.
+
+```julia
+
+julia> using WordTokenizers
+
+julia> collect(zip(WordTokenizers.tokenize(sentence), tags))
+8-element Array{Tuple{String,String},1}:
+ ("This", "DT")
+ ("package", "NN")
+ ("is", "VBZ")
+ ("maintained", "VBN")
+ ("by", "IN")
+ ("John", "NNP")
+ ("Doe", "NNP")
+ (".", ".")
+
+```
+
+For tagging a multi-sentence text or document, one can use `split_sentences` from the `WordTokenizers.jl` package and run the POS model on each sentence.
+
+```julia
+julia> sentences = "Rabinov is winding up his term as ambassador. He will be replaced by Eliahu Ben-Elissar, a former Israeli envoy to Egypt and right-wing Likud party politiian." # Sentence taken from CoNLL 2003 Dataset
+
+julia> splitted_sents = WordTokenizers.split_sentences(sentences)
+
+julia> tag_sequences = pos.(splitted_sents)
+2-element Array{Array{String,1},1}:
+ ["NNP", "VBZ", "VBG", "RP", "PRP\$", "NN", "IN", "NN", "."]
+ ["PRP", "MD", "VB", "VBN", "IN", "NNP", "NNP", ",", "DT", "JJ", "JJ", "NN", "TO", "NNP", "CC", "JJ", "NNP", "NNP", "NNP", "."]
+
+julia> zipped = [collect(zip(tag_sequences[i], WordTokenizers.tokenize(splitted_sents[i]))) for i in eachindex(splitted_sents)]
+
+julia> zipped[1]
+9-element Array{Tuple{String,String},1}:
+ ("NNP", "Rabinov")
+ ("VBZ", "is")
+ ("VBG", "winding")
+ ("RP", "up")
+ ("PRP\$", "his")
+ ("NN", "term")
+ ("IN", "as")
+ ("NN", "ambassador")
+ (".", ".")
+
+julia> zipped[2]
+20-element Array{Tuple{String,String},1}:
+ ("PRP", "He")
+ ("MD", "will")
+ ("VB", "be")
+ ("VBN", "replaced")
+ ("IN", "by")
+ ("NNP", "Eliahu")
+ ("NNP", "Ben-Elissar")
+ (",", ",")
+ ("DT", "a")
+ ("JJ", "former")
+ ("JJ", "Israeli")
+ ("NN", "envoy")
+ ("TO", "to")
+ ("NNP", "Egypt")
+ ("CC", "and")
+ ("JJ", "right-wing")
+ ("NNP", "Likud")
+ ("NNP", "party")
+ ("NNP", "politiian")
+ (".", ".")
+
+```
+
+Since Part of Speech tagging is done at the sentence level,
+the text of an `AbstractDocument` is split into sentences and each sentence is then labelled.
+However, this is not possible for an `NGramDocument`, as its text cannot be recreated.
+For a `TokenDocument`, the text is approximated in order to split it into sentences; hence the following throws a warning when tagging the `Corpus`.
+ +```julia + +julia> crps = Corpus([StringDocument("We aRE vErY ClOSE tO ThE HEaDQuarTeRS."), TokenDocument("this is Bangalore.")]) +A Corpus with 2 documents: + * 1 StringDocument's + * 0 FileDocument's + * 1 TokenDocument's + * 0 NGramDocument's + +Corpus's lexicon contains 0 tokens +Corpus's index contains 0 tokens + +julia> pos(crps) +┌ Warning: TokenDocument's can only approximate the original text +└ @ TextAnalysis ~/.julia/dev/TextAnalysis/src/document.jl:220 +2-element Array{Array{Array{String,1},1},1}: + [["PRP", "VBP", "RB", "JJ", "TO", "DT", "NN", "."]] + [["DT", "VBZ", "NNP", "."]] + +``` diff --git a/src/CRF/crf.jl b/src/CRF/crf.jl index 98ffc23..3145d89 100644 --- a/src/CRF/crf.jl +++ b/src/CRF/crf.jl @@ -22,10 +22,10 @@ function CRF(n::Integer) W[:, n + 1] .= -10000 W[n + 2, :] .= -10000 - return CRF(param(W), n) + return CRF(W, n) end -@treelike CRF +@functor CRF function Base.show(io::IO, c::CRF) print(io, "CRF with ", c.n + 2, " distinct tags (including START and STOP tags).") diff --git a/src/CRF/loss.jl b/src/CRF/loss.jl index 495816d..32501bd 100644 --- a/src/CRF/loss.jl +++ b/src/CRF/loss.jl @@ -5,13 +5,13 @@ Compute the Normalization / partition function or the Forward Algorithm score - `Z` """ function forward_score(c::CRF, x, init_α) - forward_var = log_sum_exp((c.W .+ x[1]') .+ init_α) + forward_var = log_sum_exp(c.W .+ x[1]' .+ init_α) for i in 2:length(x) forward_var = log_sum_exp((c.W .+ x[i]') .+ forward_var') end - return log_sum_exp(c.W[:, c.n + 2] + forward_var')[1] + return log_sum_exp(c.W[:, c.n + 2] .+ forward_var')[1] end """ diff --git a/src/CRF/predict.jl b/src/CRF/predict.jl index 29e2c34..3225b70 100644 --- a/src/CRF/predict.jl +++ b/src/CRF/predict.jl @@ -35,14 +35,14 @@ Computes the forward pass for viterbi algorithm. 
function _decode(c::CRF, x, init_vit_vars) α_idx = zeros(Int, c.n + 2, length(x)) - forward_var, α_idx[:, 1] = forward_pass_unit(Tracker.data((c.W .+ x[1]') .+ init_vit_vars)) + forward_var, α_idx[:, 1] = forward_pass_unit((c.W .+ x[1]') .+ init_vit_vars) for i in 2:length(x) - forward_var, α_idx[:, i] = forward_pass_unit(Tracker.data((c.W .+ x[i]') .+ forward_var')) + forward_var, α_idx[:, i] = forward_pass_unit((c.W .+ x[i]') .+ forward_var') end labels = zeros(Int, length(x)) - labels[end] = argmax(forward_var + Tracker.data(c.W[:, c.n + 2])')[2] + labels[end] = argmax(forward_var + (c.W[:, c.n + 2])')[2] for i in reverse(2:length(x)) labels[i - 1] = α_idx[labels[i], i] diff --git a/src/TextModels.jl b/src/TextModels.jl index a82ec68..5c88496 100644 --- a/src/TextModels.jl +++ b/src/TextModels.jl @@ -7,8 +7,8 @@ module TextModels using Pkg.Artifacts - using Flux, Tracker - using Flux: identity, onehot, onecold, @treelike, onehotbatch + using Flux, Zygote + using Flux: identity, onehot, onecold, @functor, onehotbatch using TextAnalysis @@ -36,15 +36,17 @@ module TextModels include("sequence/pos_datadeps.jl") include("sequence/pos.jl") include("sequence/sequence_models.jl") - - + + # ULMFiT module ULMFiT - using ..TextAnalysis - using DataDeps + using TextAnalysis using Flux - using Tracker + using Flux:crossentropy + using Zygote using BSON + using CorpusLoaders + using DataDeps include("ULMFiT/utils.jl") include("ULMFiT/datadeps.jl") include("ULMFiT/data_loaders.jl") @@ -60,7 +62,7 @@ module TextModels ner_datadep_register() pos_datadep_register() ULMFiT.ulmfit_datadep_register() - + global sentiment_model = artifact"sentiment_model" end end diff --git a/src/ULMFiT/custom_layers.jl b/src/ULMFiT/custom_layers.jl index e402c7d..d83c43c 100644 --- a/src/ULMFiT/custom_layers.jl +++ b/src/ULMFiT/custom_layers.jl @@ -8,7 +8,7 @@ This file contains the custom layers defined for this model: PooledDense """ -import Flux: gate, _testmode!, _dropout_kernel +import Flux: gate, testmode!, _dropout_kernel reset_masks!(entity) = nothing reset_probability!(entity) = nothing @@ -44,12 +44,12 @@ Moreover this also follows the Vartional DropOut citeria, that is, the drop mask is remains same for a whole training pass. This is done by saving the masks in 'maskWi' and 'maskWh' fields """ -mutable struct WeightDroppedLSTMCell{A, V, M} +mutable struct WeightDroppedLSTMCell{A, V, S, M} Wi::A Wh::A b::V - h::V - c::V + h::S + c::S p::Float64 maskWi::M maskWh::M @@ -60,17 +60,17 @@ function WeightDroppedLSTMCell(in::Integer, out::Integer, p::Float64=0.0; init = Flux.glorot_uniform) @assert 0 ≤ p ≤ 1 cell = WeightDroppedLSTMCell( - param(init(out*4, in)), - param(init(out*4, out)), - param(init(out*4)), - param(zeros(Float32, out)), - param(zeros(Float32, out)), + init(out*4, in), + init(out*4, out), + init(out*4), + reshape(zeros(Float32, out), out, 1), + reshape(zeros(Float32, out), out, 1), p, drop_mask((out*4, in), p), drop_mask((out*4, out), p), true ) - cell.b.data[gate(out, 2)] .= 1 + cell.b[gate(out, 2)] .= 1 return cell end @@ -88,9 +88,12 @@ function (m::WeightDroppedLSTMCell)((h, c), x) return (h′, c), h′ end -Flux.@treelike WeightDroppedLSTMCell +Flux.@functor WeightDroppedLSTMCell -_testmode!(m::WeightDroppedLSTMCell, test) = (m.active = !test) +Flux.trainable(m::WeightDroppedLSTMCell) = (m.Wi, m.Wh, m.b, m.h, m.c) + +testmode!(m::WeightDroppedLSTMCell, mode=true) = + (m.active = (isnothing(mode) || mode == :auto) ? 
nothing : !mode; m) """ WeightDroppedLSTM(in::Integer, out::Integer, p::Float64=0.0) @@ -106,9 +109,25 @@ julia> wd = WeightDroppedLSTM(4, 5, 0.3); function WeightDroppedLSTM(a...; kw...) cell = WeightDroppedLSTMCell(a...;kw...) hidden = (cell.h, cell.c) - return Flux.Recur(cell, hidden, hidden) + return Flux.Recur(cell, hidden) end +""" + reset!(m) + +Resets the h, c parameters of the LSTM Cell. + +For more refer [`Flux.reset`](@ref https://fluxml.ai/Flux.jl/stable/models/layers/#Flux.reset!) +""" +function reset!(m) + try # to accomodate the definition in previously trained Language Model + (m.state = (m.cell.h, m.cell.c)) + catch + Flux.reset!(m) + end +end + + """ reset_masks!(layer) @@ -155,7 +174,9 @@ end AWD_LSTM(in::Integer, out::Integer, p::Float64=0.0; kw...) = AWD_LSTM(WeightDroppedLSTM(in, out, p; kw...), -1, []) -Flux.@treelike AWD_LSTM +Flux.@functor AWD_LSTM + +Flux.trainable(m::AWD_LSTM) = (m.layer,) (m::AWD_LSTM)(in) = m.layer(in) @@ -184,12 +205,12 @@ function asgd_step!(iter::Integer, layer::AWD_LSTM) p = get_trainable_params([layer]) avg_fact = 1/max(iter - layer.T + 1, 1) if avg_fact != 1 - layer.accum = layer.accum .+ Tracker.data.(p) + layer.accum = layer.accum .+ p for (ps, accum) in zip(p, layer.accum) - Tracker.data(ps) .= avg_fact*accum + ps .= avg_fact*accum end else - layer.accum = deepcopy(Tracker.data.(p)) # Accumulator for ASGD + layer.accum = deepcopy(p) # Accumulator for ASGD end end return @@ -230,7 +251,8 @@ function (vd::VarDrop)(x) return (x .* vd.mask) end -_testmode!(vd::VarDrop, test) = (vd.active = !test) +testmode!(m::VarDrop, mode=true) = + (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) # method for reseting mask of VarDrop reset_masks!(vd::VarDrop) = (vd.reset = true) @@ -270,7 +292,7 @@ end function DroppedEmbeddings(in::Integer, embed_size::Integer, p::Float64=0.0; init = Flux.glorot_uniform) de = DroppedEmbeddings{AbstractArray, typeof(p)}( - param(init(in, embed_size)), + init(in, embed_size), p, drop_mask((in,), p), true @@ -283,9 +305,12 @@ function (de::DroppedEmbeddings)(x::AbstractArray, tying::Bool=false) return tying ? dropped * x : transpose(dropped[x, :]) end -Flux.@treelike DroppedEmbeddings +Flux.@functor DroppedEmbeddings + +Flux.trainable(m::DroppedEmbeddings) = (m.emb,) -_testmode!(de::DroppedEmbeddings, test) = (de.active = !test) +testmode!(m::DroppedEmbeddings, mode=true) = + (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) function reset_masks!(de::DroppedEmbeddings) de.mask = drop_mask(de.mask, de.p) @@ -324,10 +349,10 @@ PooledDense(W, b) = PooledDense(W, b, identity) function PooledDense(hidden_sz::Integer, out::Integer, σ = identity; initW = Flux.glorot_uniform, initb = (dims...) 
-> zeros(Float32, dims...)) -return PooledDense(param(initW(out, hidden_sz*3)), param(initb(out)), σ) +return PooledDense(initW(out, hidden_sz*3), initb(out), σ) end -Flux.@treelike PooledDense +Flux.@functor PooledDense function (a::PooledDense)(x) W, b, σ = a.W, a.b, a.σ diff --git a/src/ULMFiT/data_loaders.jl b/src/ULMFiT/data_loaders.jl index f59e403..839b408 100644 --- a/src/ULMFiT/data_loaders.jl +++ b/src/ULMFiT/data_loaders.jl @@ -27,29 +27,29 @@ function imdb_preprocess(doc::AbstractDocument) length(word) == 1 && return [word] return split(word, symbol) end - text = text(doc) - remove_corrupt_utf8!(text) - remove_case!(text) - prepare!(text, strip_html_tags) - tokens = tokens(text) + text_ = doc + remove_corrupt_utf8!(text_) + remove_case!(text_) + prepare!(text_, strip_html_tags) + tokens_ = tokens(text_) for symbol in [',', '.', '-', '/', "'s"] - tokens = split_word.(tokens, symbol) + tokens_ = split_word.(tokens_, symbol) temp = [] - for token in tokens + for token_ in tokens_ try - append!(temp, put(token, symbol)) + append!(temp, put(token_, symbol)) catch - append!(temp, token) + append!(temp, token_) end end - tokens = temp + tokens_ = temp end - deleteat!(tokens, findall(x -> isequal(x, "")||isequal(x, " "), tokens)) - return tokens + deleteat!(tokens_, findall(x -> isequal(x, "")||isequal(x, " "), tokens_)) + return tokens_ end # Loads WikiText-103 corpus and output a Channel to give a mini-batch at each call -function load_wikitext_103(batchsize::Integer, bptt::Integer; type = "train") +function load_wikitext_103(batchsize::Integer=16, bptt::Integer=70; type = "train") corpuspath = joinpath(datadep"WikiText-103", "wiki.$(type).tokens") corpus = read(open(corpuspath, "r"), String) corpus = tokenize(corpus) @@ -58,13 +58,13 @@ end # IMDB Data loaders for Sentiment Analysis specifically # IMDB data loader for fine-tuning Language Model -function imdb_fine_tune_data(batchsize::Integer, bptt::Integer, num_examples::Integer=50000) +function imdb_fine_tune_data(batchsize::Integer=16, bptt::Integer=70, num_examples::Integer=50000) imdb_dataset = IMDB("train_unsup") dataset = [] - for path in imdb_dataset.filepaths #extract data from the files in directory and put into channel + for path in imdb_dataset.filepaths[1:num_examples] #extract data from the files in directory and put into channel open(path) do fileio cur_text = read(fileio, String) - append!(dataset, imdb_preprocess(cur_text)) + append!(dataset, imdb_preprocess(StringDocument(cur_text))) end #open end #for return Channel(x -> generator(x, dataset; batchsize=batchsize, bptt=bptt)) diff --git a/src/ULMFiT/fine_tune_lm.jl b/src/ULMFiT/fine_tune_lm.jl index 17f33b9..22a08d3 100644 --- a/src/ULMFiT/fine_tune_lm.jl +++ b/src/ULMFiT/fine_tune_lm.jl @@ -24,17 +24,17 @@ opts : `Vector` of optimizers used to update weights for corresponding la NOTE: length(opts) == length(layers) """ -function discriminative_step!(layers, ηL::Float64, l, opts::Vector) +function discriminative_step!(layers, lm::LanguageModel, gen, ηL::Float64, opts::Vector) @assert length(opts) == length(layers) # Gradient calculation - grads = Tracker.gradient(() -> l, get_trainable_params(layers)) + grads = Zygote.gradient(() -> loss(lm, gen), get_trainable_params(layers)) # discriminative step ηl = ηL/(2.6^(length(layers)-1)) for (layer, opt) in zip(layers, opts) opt.eta = ηl for ps in get_trainable_params([layer]) - Tracker.update!(opt, ps, grads[ps]) + Flux.Optimise.update!(opt, ps, grads[ps]) end ηl *= 2.6 end @@ -47,35 +47,31 @@ end epochs::Integer=1, 
checkpoint_itvl::Integer=5000) This function contains main training loops for fine-tuning the language model. -To use this funciton, an instance of LanguageModel and a data loader is needed. +To use this function, an instance of LanguageModel and a data loader is needed. Read the docs for more info about arguments """ -function fine_tune_lm!(lm::LanguageModel, data_loader::Channel=imdb_fine_tune_data, - stlr_cut_frac::Float64=0.1, stlr_ratio::Float32=32, stlr_η_max::Float64=4e-3; +function fine_tune_lm!(lm=LanguageModel(), data_loader=imdb_fine_tune_data, + stlr_cut_frac::Float64=0.1, stlr_ratio::Float32=Float32(32), stlr_η_max::Float64=4e-3; epochs::Integer=1, checkpoint_itvl::Integer=5000) opts = [ADAM(0.001, (0.7, 0.99)) for i=1:4] - cut = num_of_iters * epochs * stlr_cut_frac - + # Fine-Tuning loops for epoch=1:epochs println("\nEpoch: $epoch") - gen = data_loader() - num_of_iters = take!(gen) + gen = data_loader() + num_of_iters = take!(gen) + cut = num_of_iters * epochs * stlr_cut_frac T = num_of_iters-Int(floor((num_of_iters*2)/100)) set_trigger!.(T, lm.layers) for i=1:num_of_iters - - # FORWARD - l = loss(lm, gen) - # Slanted triangular learning rate step t = i + (epoch-1)*num_of_iters p_frac = (i < cut) ? i/cut : (1 - ((i-cut)/(cut*(1/stlr_cut_frac-1)))) ηL = stlr_η_max*((1+p_frac*(stlr_ratio-1))/stlr_ratio) # Backprop with discriminative fine-tuning step - discriminative_step!(lm.layers[[1, 3, 5, 7]], ηL, l, opts) + discriminative_step!(lm.layers[[1, 3, 5, 7]], lm, gen, ηL, opts) # Resets dropout masks for all the layers with DropOut or DropConnect reset_masks!.(lm.layers) @@ -121,7 +117,7 @@ julia> insert!(vocab, 2, "_pad_") function set_vocab!(lm::LanguageModel, vocab::Vector) idxs = indices(vocab, lm.vocab) lm.vocab = vocab - lm.layers[1].emb = param(Tracker.data(lm.layers[1].emb)[idxs, :]) + lm.layers[1].emb = param(lm.layers[1].emb[idxs, :]) lm.layers[1].mask = gpu(drop_mask((length(vocab),), lm.layers[1].p)) return end diff --git a/src/ULMFiT/pretrain_lm.jl b/src/ULMFiT/pretrain_lm.jl index 74bc573..e659f8e 100644 --- a/src/ULMFiT/pretrain_lm.jl +++ b/src/ULMFiT/pretrain_lm.jl @@ -49,7 +49,7 @@ function LanguageModel(load_pretrained::Bool=false, vocabpath::String=joinpath(@ return lm end -Flux.@treelike LanguageModel +Flux.@functor LanguageModel """ test_lm(lm::LanguageModel, data_gen, num_of_iters::Integer; unknown_token::String="_unk_") @@ -63,7 +63,7 @@ It returns loss, accuracy, precsion, recall and F1 score. 
julia> test_lm(lm, data_gen, 200, " indices(x, lm.vocab, "_unk_"), batch) + batch = gpu(batch) batch = lm.layers.(batch) return batch end @@ -101,17 +102,17 @@ end function loss(lm, gen) H = forward(lm, take!(gen)) Y = broadcast(x -> gpu(Flux.onehotbatch(x, lm.vocab, "_unk_")), take!(gen)) - l = sum(crossentropy.(H, Y)) - Flux.truncate!(lm.layers) + l = sum(Flux.crossentropy.(H, Y)) + reset!(lm.layers) return l end # Backpropagation step while training -function backward!(layers, l, opt) +function backward!(layers, lm, gen, opt) # Calulating gradients and weights updation p = get_trainable_params(layers) - grads = Tracker.gradient(() -> l, p) - Tracker.update!(opt, p, grads) + grads = Zygote.gradient(() -> loss(lm, gen), p) + Flux.Optimise.update!(opt, p, grads) return end @@ -138,11 +139,8 @@ function pretrain_lm!(lm::LanguageModel=LanguageModel(), data_loader::Channel=lo set_trigger!.(T, lm.layers) # Setting triggers for AWD_LSTM layers for i=1:num_of_batches - # FORWARD PASS - l = loss(lm, gen) - # REVERSE PASS - backward!(lm.layers, l, opt) + backward!(lm.layers, lm, gen, opt) # ASGD Step, works after Triggering asgd_step!.(i, lm.layers) @@ -158,13 +156,18 @@ end # To save model function save_model!(m::LanguageModel, filepath::String) - weights = cpu.(Tracker.data.(params(m))) + weights = cpu.(params(m)) BSON.@save filepath weights end # To load model function load_model!(lm::LanguageModel, filepath::String) BSON.@load filepath weights + # reshape saved weights to match Recurr (h, c) shape + layers = [5, 6, 10, 11, 15, 16] + for l in layers + weights[l] = reshape(weights[l], length(weights[l]), 1) + end Flux.loadparams!(lm, weights) end @@ -182,7 +185,7 @@ SAMPLING... """ function sample(starting_text::AbstractDocument, lm::LanguageModel) testmode!(lm.layers) - model_layers = mapleaves(Tracker.data, lm.layers) + model_layers = lm.layers tokens = tokens(starting_text) word_indices = map(x -> indices([x], lm.vocab, "_unk_"), tokens) h = (model_layers.(word_indices))[end] diff --git a/src/ULMFiT/sentiment.jl b/src/ULMFiT/sentiment.jl index c70069d..3ab5479 100644 --- a/src/ULMFiT/sentiment.jl +++ b/src/ULMFiT/sentiment.jl @@ -48,12 +48,12 @@ function BinSentimentClassifier() ) ) Flux.loadparams!(sc, weights) - sc = mapleaves(Tracker.data, sc) + sc = sc Flux.testmode!(sc) return sc end -Flux.@treelike BinSentimentClassifier +Flux.@functor BinSentimentClassifier function (sc::BinSentimentClassifier)(x::TokenDocument) remove_case!(x) diff --git a/src/ULMFiT/train_text_classifier.jl b/src/ULMFiT/train_text_classifier.jl index e30912f..702bd21 100644 --- a/src/ULMFiT/train_text_classifier.jl +++ b/src/ULMFiT/train_text_classifier.jl @@ -30,7 +30,7 @@ function TextClassifier(lm::LanguageModel=LanguageModel(), clsfr_out_sz::Integer ) end -Flux.@treelike TextClassifier +Flux.@functor TextClassifier """ Cross Validate @@ -48,7 +48,7 @@ gen will be used for validation """ function validate(tc::TextClassifier, gen::Channel, num_of_batches::Union{Colon, Integer}) n_classes = size(tc.linear_layers[end-2].W, 1) - classifier = mapleaves(Tracker.data, tc) + classifier = tc Flux.testmode!(classifier) loss = 0 iters = take!(gen) @@ -91,15 +91,17 @@ tracked_steps : This is the number of tracked time-steps for Truncated Backpro """ function forward(tc::TextClassifier, gen::Channel, tracked_steps::Integer=32) # swiching off tracking - classifier = mapleaves(Tracker.data, tc) + classifier = tc X = take!(gen) l = length(X) # Truncated Backprop through time - for i=1:ceil(l/now_per_pass)-1 # Tracking is swiched 
off inside this loop
-        (i == 1 && l%now_per_pass != 0) ? (last_idx = l%now_per_pass) : (last_idx = now_per_pass)
-        H = broadcast(x -> indices(x, classifier.vocab, "_unk_"), X[1:last_idx])
-        H = classifier.rnn_layers.(H)
-        X = X[last_idx+1:end]
+    Zygote.ignore() do
+        for i=1:ceil(l/tracked_steps)-1 # Tracking is switched off inside this loop
+            (i == 1 && l%tracked_steps != 0) ? (last_idx = l%tracked_steps) : (last_idx = tracked_steps)
+            H = broadcast(x -> indices(x, classifier.vocab, "_unk_"), X[1:last_idx])
+            H = classifier.rnn_layers.(H)
+            X = X[last_idx+1:end]
+        end
     end
     # set the lated hidden states to original model
     for (t_layer, unt_layer) in zip(tc.rnn_layers[2:end], classifier.rnn_layers[2:end])
@@ -130,7 +132,7 @@ Arguments:
 classifier : Instance of TextClassifier
 gen : 'Channel' [data loader], to give a mini-batch
-tracked_words : specifies the number of time-steps for which tracking is on
+tracked_steps : specifies the number of time-steps for which tracking is on
 """
 function loss(classifier::TextClassifier, gen::Channel, tracked_steps::Integer=32)
     H = forward(classifier, gen, tracked_steps)
@@ -140,6 +142,23 @@ function loss(classifier::TextClassifier, gen::Channel, tracked_steps::Integer=3
     return l
 end
 
+function discriminative_step!(layers, classifier::TextClassifier, gen::Channel, tracked_steps::Integer, ηL::Float64, opts::Vector)
+    @assert length(opts) == length(layers)
+    # Gradient calculation
+    grads = Zygote.gradient(() -> loss(classifier, gen, tracked_steps), get_trainable_params(layers))
+
+    # discriminative step
+    ηl = ηL/(2.6^(length(layers)-1))
+    for (layer, opt) in zip(layers, opts)
+        opt.eta = ηl
+        for ps in get_trainable_params([layer])
+            Flux.Optimise.update!(opt, ps, grads[ps])
+        end
+        ηl *= 2.6
+    end
+    return
+end
+
 """
     train_classifier!(classifier::TextClassifier=TextClassifier(), classes::Integer=1, data_loader::Channel=imdb_classifier_data, hidden_layer_size::Integer=50;kw...)
@@ -151,7 +170,7 @@ function train_classifier!(classifier::TextClassifier=TextClassifier(), classes:
     data_loader::Channel=imdb_classifier_data, hidden_layer_size::Integer=50;
     stlr_cut_frac::Float64=0.1, stlr_ratio::Number=32, stlr_η_max::Float64=0.01,
     val_loader::Channel=nothing, cross_val_batches::Union{Colon, Integer}=:,
-    epochs::Integer=1, checkpoint_itvl=5000)
+    epochs::Integer=1, checkpoint_itvl=5000, tracked_steps::Integer=32)
 
     trainable = []
     append!(trainable, [classifier.rnn_layers[[1, 3, 5, 7]]...])
@@ -166,7 +185,6 @@ function train_classifier!(classifier::TextClassifier=TextClassifier(), classes:
         num_of_iters = take!(gen)
         cut = num_of_iters * epochs * stlr_cut_frac
         for iter=1:num_of_iters
-            l = loss(classifier, gen, now_per_pass = now_per_pass)
 
             # Slanted triangular learning rates
             t = iter + (epoch-1)*num_of_iters
@@ -175,7 +193,7 @@ function train_classifier!(classifier::TextClassifier=TextClassifier(), classes:
 
             # Gradual-unfreezing Step with discriminative fine-tuning
            unfreezed_layers, cur_opts = (epoch < length(trainable)) ?
(trainable[end-epoch+1:end], opts[end-epoch+1:end]) : (trainable, opts) - discriminative_step!(unfreezed_layers, ηL, l, cur_opts) + discriminative_step!(unfreezed_layers, classifier, gen, tracked_steps,ηL, cur_opts) reset_masks!.(classifier.rnn_layers) # reset all dropout masks end @@ -203,13 +221,13 @@ All the preprocessing related to the used vocabulary should be done before using Use `prepare!` function to do preprocessing """ function predict(tc::TextClassifier, text_sents::Corpus) - classifier = mapleaves(Tracker.data, tc) + classifier = tc Flux.testmode!(classifier) predictions = [] expr(x) = indices(x, classifier.vocab, "_unk_") for text in text_sents - tokens = tokens(text) - h = classifier.rnn_layers.(expr.(tokens)) + tokens_ = tokens(text) + h = classifier.rnn_layers.(expr.(tokens_)) probability_dist = classifier.linear_layers(h) class = argmax(probaility_dist) push!(predictions, class) diff --git a/src/ULMFiT/utils.jl b/src/ULMFiT/utils.jl index 691354f..64bfd11 100644 --- a/src/ULMFiT/utils.jl +++ b/src/ULMFiT/utils.jl @@ -27,8 +27,8 @@ end init_weights(extreme::AbstractFloat, dims...) = randn(Float32, dims...) .* sqrt(Float32(extreme)) # Generator, whenever it should be called two times since it gives X in first and y in second call -function generator(c::Channel, corpus::AbstractDocument; batchsize::Integer=64, bptt::Integer=70) - X_total = post_pad_sequences(chunk(tokens(corpus), batchsize)) +function generator(c::Channel, corpus; batchsize::Integer=64, bptt::Integer=70) + X_total = post_pad_sequences(Flux.chunk(corpus, batchsize)) n_batches = Int(floor(length(X_total[1])/bptt)) put!(c, n_batches) for i=1:n_batches diff --git a/src/sequence/pos.jl b/src/sequence/pos.jl index 9346a3a..b23c210 100644 --- a/src/sequence/pos.jl +++ b/src/sequence/pos.jl @@ -1,4 +1,4 @@ -using BSON, Tracker +using BSON const PoSCharUNK = '¿' const PoSWordUNK = "" diff --git a/src/sequence/sequence_models.jl b/src/sequence/sequence_models.jl index b19e6a0..8b4a3a6 100644 --- a/src/sequence/sequence_models.jl +++ b/src/sequence/sequence_models.jl @@ -1,4 +1,4 @@ -using BSON, Tracker +using BSON mutable struct BiLSTM_CNN_CRF_Model{C, W, L, D, O, A} labels::Array{String, 1} # List of Labels chars_idx#::Dict{Char, Integer} # Dict that maps chars to indices in W_Char_Embed @@ -33,32 +33,32 @@ function BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, UNK_char_idx,UNK_Wor init_α[n + 1] = 0 # Word and Character Embeddings. 
- W_word_Embed = BSON.load(joinpath(weights_path, "W_word_cpu.bson"))[:W_word_cpu] - W_Char_Embed = BSON.load(joinpath(weights_path, "W_char_cpu.bson"))[:W_char_cpu] + W_word_Embed = BSON.load(joinpath(weights_path, "W_word_cpu.bson"))[:W_word_cpu][:, 1:end-1] # no padding char token here + W_Char_Embed = BSON.load(joinpath(weights_path, "W_char_cpu.bson"))[:W_char_cpu][:, 1:end-1] # no padding word token here # Forward_LSTM forward_wts = BSON.load(joinpath(weights_path, "forward_lstm.bson")) forward_lstm = Flux.Recur(Flux.LSTMCell(forward_wts[:lstm_2], # Wi forward_wts[:lstm_1], # Wh forward_wts[:lstm_3], # b - forward_wts[:lstm_4], # h - forward_wts[:lstm_5] # c + (reshape(forward_wts[:lstm_4], length(forward_wts[:lstm_4]), 1), # h + reshape(forward_wts[:lstm_5], length(forward_wts[:lstm_5]), 1)) # c ), - forward_wts[:lstm_init], - forward_wts[:lstm_state] - ) + (reshape(forward_wts[:lstm_state][1], length(forward_wts[:lstm_state][1]), 1), # h + reshape(forward_wts[:lstm_state][2], length(forward_wts[:lstm_state][2]), 1)) + ) # Backward_LSTM backward_wts = BSON.load(joinpath(weights_path, "backward_lstm.bson")) backward = Flux.Recur(Flux.LSTMCell(backward_wts[:lstm_2], # Wi backward_wts[:lstm_1], # Wh backward_wts[:lstm_3], # b - backward_wts[:lstm_4], # h - backward_wts[:lstm_5] # c - ), - backward_wts[:lstm_init], - backward_wts[:lstm_state] - ) + (reshape(backward_wts[:lstm_4], length(backward_wts[:lstm_4]), 1), # h + reshape(backward_wts[:lstm_5], length(backward_wts[:lstm_5]), 1)) # c + ), + (reshape(backward_wts[:lstm_state][1], length(backward_wts[:lstm_state][1]), 1), # h + reshape(backward_wts[:lstm_state][2], length(backward_wts[:lstm_state][2]), 1)) + ) # Dense d_weights_bias = BSON.load(joinpath(weights_path, "d_cpu.bson")) @@ -69,7 +69,7 @@ function BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, UNK_char_idx,UNK_Wor # Load CRF. 
crf_wt = BSON.load(joinpath(weights_path, "crf_cpu.bson"))[:crf_Weights] - c = TextModels.CRF(crf_wt, size(crf_wt)[1] - 2) + c = CRF(crf_wt, size(crf_wt)[1] - 2) # Load Conv conv_wt_bias = BSON.load(joinpath(weights_path, "conv_cpu.bson")) @@ -79,6 +79,8 @@ function BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, UNK_char_idx,UNK_Wor (1, 1), # stride (0, 2), # pad (1, 1), # dilation + 1 # groups + ) BiLSTM_CNN_CRF_Model(labels, chars_idx, words_idx, conv1, W_Char_Embed, W_word_Embed, @@ -100,7 +102,7 @@ function (a::BiLSTM_CNN_CRF_Model)(x) oh_outs = viterbi_decode(a.c, m(x), a.init_α) Flux.reset!(a.backward) Flux.reset!(a.forward_lstm) - [a.labels[oh.ix] for oh in oh_outs] + [a.labels[oh.indices] for oh in oh_outs] end onehotinput(m::BiLSTM_CNN_CRF_Model, word) = (onehot(get(m.words_idx, lowercase(word), m.UNK_Word_idx), 1:length(m.words_idx)), diff --git a/test/crf.jl b/test/crf.jl index 34237d2..a548a4b 100644 --- a/test/crf.jl +++ b/test/crf.jl @@ -1,5 +1,5 @@ using Flux -using Flux: gradient, LSTM, Dense, reset!, onehot, RNN +using Flux: gradient, LSTM, Dense, reset!, onehot, RNN, params using TextModels: score_sequence, forward_score @testset "crf" begin @@ -108,7 +108,7 @@ using TextModels: score_sequence, forward_score init_α = fill(-10000, (c.n + 2, 1)) init_α[c.n + 1] = 0 - loss(xs, ys) = crf_loss(c, m(xs), ys, init_α) + loss(xs, ys) = crf_loss(c, m(xs), ys, init_α) + 1e-4*sum(c.W.*c.W) opt = Descent(0.01) data = zip(X, Y) @@ -117,29 +117,29 @@ using TextModels: score_sequence, forward_score function train() for d in data - reset!(lstm) - grads = Tracker.gradient(() -> loss(d[1], d[2]), ps) + Flux.reset!(lstm) + grads = gradient(() -> loss(d[1], d[2]), ps) Flux.Optimise.update!(opt, ps, grads) end end function find_loss(d) - reset!(lstm) + Flux.reset!(lstm) loss(d[1], d[2]) end to_sum = [find_loss(d) for d in data] l1 = sum(to_sum) - dense_param_1 = deepcopy(Tracker.data(d_out.W)) - lstm_param_1 = deepcopy(Tracker.data(lstm.cell.Wh)) - crf_param_1 = deepcopy(Tracker.data(c.W)) + dense_param_1 = deepcopy(d_out.W) + lstm_param_1 = deepcopy(lstm.cell.Wh) + crf_param_1 = deepcopy(c.W) for i in 1:10 train() end - dense_param_2 = deepcopy(Tracker.data(d_out.W)) - lstm_param_2 = deepcopy(Tracker.data(lstm.cell.Wh)) - crf_param_2 = deepcopy(Tracker.data(c.W)) + dense_param_2 = deepcopy(d_out.W) + lstm_param_2 = deepcopy(lstm.cell.Wh) + crf_param_2 = deepcopy(c.W) l2 = sum([find_loss(d) for d in data]) @test l1 > l2 @@ -148,3 +148,4 @@ using TextModels: score_sequence, forward_score @test crf_param_1 != crf_param_2 end end + diff --git a/test/runtests.jl b/test/runtests.jl index 1bcac94..2738bfa 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -7,6 +7,6 @@ println("Running tests:") include("crf.jl") include("ner.jl") include("pos.jl") +include("sentiment.jl") include("averagePerceptronTagger.jl") include("ulmfit.jl") -include("sentiment.jl") diff --git a/test/ulmfit.jl b/test/ulmfit.jl index 8ea0092..3deca62 100644 --- a/test/ulmfit.jl +++ b/test/ulmfit.jl @@ -4,7 +4,7 @@ using BSON @testset "Custom layers" begin @testset "WeightDroppedLSTM" begin wd = ULMFiT.WeightDroppedLSTM(4, 5, 0.3) - @test all(wd.init .== wd.state) + @test all((wd.cell.h, wd.cell.c) .== wd.state) @test size(wd.cell.Wi) == size(wd.cell.maskWi) @test size(wd.cell.Wh) == size(wd.cell.maskWh) @test wd.cell.active @@ -31,10 +31,10 @@ using BSON ULMFiT.asgd_step!(4, awd) @test length(awd.accum) == 3 temp = deepcopy(awd.accum[1][1]) - @test temp == Tracker.data(awd.layer.cell.Wi[1]) + @test temp == 
awd.layer.cell.Wi[1] ULMFiT.asgd_step!(5, awd) temp += temp - @test temp == Tracker.data(awd.accum[1][1]) + @test temp == awd.accum[1][1] @test length(params(awd)) == 5 end @@ -95,6 +95,12 @@ end @test length(ULMFiT.get_trainable_params(lm.layers)) == 10 pretrained_weights = BSON.load(datadep"Pretrained ULMFiT Language Model/ulmfit_lm_en.bson") + # reshape weights of (h, c) + layers = [5, 6, 10, 11, 15, 16] + for i in layers + pretrained_weights[:weights][i] = reshape(pretrained_weights[:weights][i], length(pretrained_weights[:weights][i]), 1) + end + @test length(pretrained_weights[:weights]) == 16 @test all(size.(params(lm)) .== size.(pretrained_weights[:weights])) end