Skip to content

Commit c9638be

Browse files
committed
feat lm: complete basic training and inference.
1 parent 28dce04 commit c9638be

File tree

14 files changed

+1294
-22
lines changed

14 files changed

+1294
-22
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
/target
2+
/data
3+
/lm/data
24
*.db
35
/result
46
.env

lm/Cargo.lock

Lines changed: 71 additions & 5 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

lm/Cargo.toml

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,19 @@ version = "0.1.0"
44
edition = "2024"
55

66
[dependencies]
7-
anyhow = "1.0.101"
8-
burn = { version = "0.20.1", features = ["candle", "train", "std"] }
7+
anyhow = { version = "1.0.101", features = ["backtrace"] }
8+
burn = { version = "0.20.1", features = ["ndarray", "tch", "train", "std"] }
99
tokenizers = { version = "0.22.2" }
10+
11+
serde_json = { version = "1.0.149" }
12+
rand = "0.10.0"
13+
memmap2 = "0.5"
14+
num-traits = "0.2.19"
15+
color-backtrace = "0.7.2"
16+
log = "0.4.29"
17+
18+
[[bin]]
19+
name = "train_tokenizer"
20+
21+
[[bin]]
22+
name = "run_inference"

lm/README.md

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# LM
2+
3+
A small transformers language model for the bot.
4+
5+
## Inference
6+
7+
Simply run `cargo r --bin infer model_path` with desired prompt as stdin.
8+
Here the model path is typically `data/model.mpk`.
9+
10+
## Training
11+
12+
First of all, export telegram chat messages to json format and copy the result.json to `data/result.json`.
13+
Nothing else is needed, and only text messages with some simple markups are accepted.
14+
Stickers and images are filtered out and ignored.
15+
16+
Before starting training, some steps must be performed.
17+
They can easily be done by running the corresponding binary with `cargo r --bin name`:
18+
19+
- Train the tokenizer (MUST be the first step): `train_tokenizer`;
20+
- Translate the message json into datasets: `trans_dataset`;
21+
- Run pretrain: `pretrain`.
22+
23+
Training can be continued by simply rerunning pretrain:
24+
it would automatically read `data/model.mpk` to resume the model.
25+
Therefore, remember to remove the model file if you want to retrain from scratch or if the model config has changed.
26+
27+
## Changing the model
28+
29+
**Note**: it's better to remove the data dir and rerun the whole training process after changing the model.
30+
Unless you know exactly what you're doing.
31+
32+
The backend for training and inference (no other process needs a backend) is `lib.rs::DefaultBackend`.
33+
Normally changing this after training won't affect inference,
34+
but note that the default float precision may also be provided as a generic argument (as its value in HEAD is),
35+
in which case the inference would be affected if the precision is changed.
36+
37+
The model config (layer num, hidden size, etc.) is provided in `impl Default for LmConfig` in `lib.rs`.
38+
The default config is not clever (0.08b --- too small), and would definitely overfit on small groups/messages.

lm/src/bin/infer.rs

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
#![recursion_limit = "256"]
2+
use std::io::{Read, Write};
3+
4+
use burn::{
5+
config::Config,
6+
module::Module,
7+
record::{FullPrecisionSettings, NamedMpkFileRecorder},
8+
tensor::{Int, Tensor, activation::softmax},
9+
};
10+
use lm::{DefaultBackend, ExtraInfo, LmConfig, LmModel};
11+
use rand::RngExt;
12+
use tokenizers::Tokenizer;
13+
14+
fn main() -> anyhow::Result<()> {
    // Install pretty panic backtraces, filtered to frames that originate
    // from this crate only.
    color_backtrace::BacktracePrinter::new()
        .strip_function_hash(true)
        .add_frame_filter(Box::new(|frames| {
            // file!() is relative to the workspace; canonicalize it and step
            // up two directories to obtain the crate root used for filtering.
            let crate_path = std::path::Path::new(file!()).canonicalize().unwrap();
            let crate_path = crate_path.parent().unwrap().parent().unwrap();
            frames.retain(|f| {
                f.filename.as_ref().is_some_and(|f| {
                    f.canonicalize().ok().is_some_and(|f| f.starts_with(crate_path))
                })
            });
        }))
        .install(color_backtrace::default_output_stream());

    // First CLI argument: path to the serialized model (per the README,
    // typically `data/model.mpk`).
    let mut args = std::env::args();
    args.next();
    let model_path = args.next().expect("model path argument required");

    // Model config and tokenizer come from fixed paths produced by training.
    let config = LmConfig::load("data/config.json")?;
    let tokenizer =
        Tokenizer::from_file("data/tokenizer.json").map_err(anyhow::Error::from_boxed)?;

    // Build an empty model with the saved config, then load the trained
    // weights from `model_path`.
    let device = Default::default();
    let model: LmModel<DefaultBackend> = LmModel::new(&config, &device);
    let recorder = NamedMpkFileRecorder::<FullPrecisionSettings>::new();
    let model = model.load_file(model_path, &recorder, &device)?;

    // The prompt is the entire stdin, trimmed.
    let mut prompt = String::new();
    std::io::stdin().read_to_string(&mut prompt)?;
    let prompt = prompt.trim();
    println!("prompt: '{prompt}'");

    let tokens = tokenizer.encode(prompt, true).map_err(anyhow::Error::from_boxed)?;
    let tokens = tokens.get_ids();
    // NOTE(review): `ExtraInfo` presumably holds incremental decoding state
    // (e.g. a KV cache) enabled by the `true` flag — confirm in lib.rs.
    let mut info = ExtraInfo::new(&config, true, &device);
    // Shape the full prompt as a [1, seq_len] batch for the first forward pass.
    let mut input =
        Tensor::<_, 1, Int>::from_ints(tokens, &device).reshape([1, tokens.len() as isize]);

    let mut rng = rand::rng();
    // Autoregressive generation, capped at the model's maximum sequence length.
    for _ in 0..config.max_seq_len {
        // Softmax over dim 2 (the vocab dimension) to get token probabilities.
        let output = softmax(model.forward(input, &mut info), 2);
        let seq_len = output.dims()[1];
        // Top-k (k = 10) sampling over the distribution at the last position.
        let (prob, idx) = output
            .slice_dim(1, seq_len - 1..)
            .reshape([config.vocab_size as isize])
            .topk_with_indices(10, 0);
        /*println!(
            "{:?}",
            idx.clone()
                .to_data()
                .iter()
                .map(|i| (i, tokenizer.decode(&[i], true).unwrap()))
                .collect::<Vec<_>>()
        );*/
        let prob = prob.slice_dim(0, 0..10);
        // Renormalize the top-10 probabilities and sample by inverse CDF:
        // the first cumulative probability exceeding a uniform draw is picked
        // (argmax returns the first index of the maximal 0/1 mask value).
        let prob = (prob.clone().div(prob.sum_dim(0))).cumsum(0);
        let mask = prob.greater_elem(rng.random::<f32>()).int().argmax(0).reshape([-1]);
        let next = idx.select(0, mask).reshape([1, 1]);

        // Feed back only the newly sampled token; earlier context lives in `info`.
        input = next.clone();

        let next = next.into_data().iter::<u32>().next().expect("token");
        if next == 0 {
            // Token id 0 renders as the message delimiter.
            println!("<|delim|>");
        } else {
            print!("{}", tokenizer.decode(&[next], true).map_err(anyhow::Error::from_boxed)?);
        }
        // Flush so tokens appear as they are generated.
        std::io::stdout().flush()?;
        // After a delimiter, stop generation with 30% probability.
        if next == 0 && rng.random::<f32>() < 0.3 {
            break;
        }
    }

    Ok(())
}

lm/src/bin/inspect_dataset.rs

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
use anyhow::Context;
2+
use burn::data::dataset::Dataset;
3+
use lm::dataset::{ChatFile, SeqLenWrapper};
4+
use tokenizers::Tokenizer;
5+
6+
fn main() -> anyhow::Result<()> {
7+
let mut args = std::env::args();
8+
args.next();
9+
let data = ChatFile::new(args.next().context("dataset in arg")?)?;
10+
let data = SeqLenWrapper::new(data, 128);
11+
let tokenizer =
12+
Tokenizer::from_file("data/tokenizer.json").map_err(anyhow::Error::from_boxed)?;
13+
println!("dataset len {}", data.len());
14+
for i in 0..=data.len() {
15+
let Some(it) = data.get(i) else {
16+
println!("(Empty at #{i})");
17+
continue;
18+
};
19+
println!("#{i}: {it:?} => '{}'", tokenizer.decode(&it, false).map_err(anyhow::Error::from_boxed)?);
20+
}
21+
Ok(())
22+
}

0 commit comments

Comments
 (0)