## ----setup, include=FALSE-----------------------------------------------------
# Vignette code is executed locally (NOT_CRAN=true) but not on CRAN, where
# the CPU fallback would multi-thread and trip the "CPU time > elapsed" NOTE.
knitr::opts_chunk$set(eval = identical(Sys.getenv("NOT_CRAN"), "true"))

## -----------------------------------------------------------------------------
# library(ggmlR)

## -----------------------------------------------------------------------------
# # ag_tensor — non-trainable input (e.g. data)
# x <- ag_tensor(matrix(1:6 / 6, nrow = 2L))   # [2, 3]
# 
# # ag_param — trainable parameter (gradient accumulated)
# W <- ag_param(matrix(rnorm(4), nrow = 2L))    # [2, 2]
# b <- ag_param(matrix(0.0, 2L, 1L))

## -----------------------------------------------------------------------------
# with_grad_tape({
#   h    <- ag_relu(ag_add(ag_matmul(W, x), b))   # [2, 3]
#   loss <- ag_mean(ag_mul(h, h))                  # scalar MSE-like
# })
# 
# grads <- backward(loss)   # returns named list of gradients
# 
# cat("dL/dW:\n"); print(grads[["W"]])
# cat("dL/db:\n"); print(grads[["b"]])

## -----------------------------------------------------------------------------
# data(iris)
# set.seed(42)
# 
# x_all <- t(scale(as.matrix(iris[, 1:4])))        # [4, 150]
# y_oh  <- model.matrix(~ Species - 1, iris)
# y_all <- t(y_oh)                                  # [3, 150]
# 
# idx  <- sample(150L)
# x_tr <- x_all[, idx[1:120]];  x_vl <- x_all[, idx[121:150]]
# y_tr <- y_all[, idx[1:120]];  y_vl <- y_all[, idx[121:150]]
# 
# model <- ag_sequential(
#   ag_linear(4L,  64L, activation = "relu"),
#   ag_batch_norm(64L),
#   ag_dropout(0.3),
#   ag_linear(64L, 32L, activation = "relu"),
#   ag_linear(32L,  3L)
# )
# 
# params <- model$parameters()
# cat("Parameter tensors:", length(params), "\n")

## -----------------------------------------------------------------------------
# # Adam (default lr = 1e-3)
# opt <- optimizer_adam(params, lr = 1e-3)
# 
# # SGD with momentum
# opt_sgd <- optimizer_sgd(params, lr = 0.05, momentum = 0.9)

## -----------------------------------------------------------------------------
# BS <- 32L
# n  <- ncol(x_tr)
# 
# ag_train(model)   # set training mode (enables dropout, batch norm train)
# set.seed(1)
# 
# for (ep in seq_len(150L)) {
#   perm <- sample(n)
#   for (b in seq_len(ceiling(n / BS))) {
#     idx <- perm[seq((b-1L)*BS + 1L, min(b*BS, n))]
#     xb  <- ag_tensor(x_tr[, idx, drop = FALSE])
#     yb  <- y_tr[, idx, drop = FALSE]
# 
#     with_grad_tape({
#       loss <- ag_softmax_cross_entropy_loss(model$forward(xb), yb)
#     })
#     grads <- backward(loss)
#     opt$step(grads)
#     opt$zero_grad()
#   }
# 
#   if (ep %% 50L == 0L)
#     cat(sprintf("epoch %3d  loss %.4f\n", ep, loss$data[1]))
# }

## -----------------------------------------------------------------------------
# opt2 <- optimizer_adam(params, lr = 1e-3)
# 
# # Cosine annealing: lr goes from lr_max to lr_min over T_max epochs
# sch_cos <- lr_scheduler_cosine(opt2, T_max = 150L, lr_min = 1e-5)
# 
# # Step decay: multiply lr by gamma every step_size epochs
# sch_step <- lr_scheduler_step(opt2, step_size = 30L, gamma = 0.5)
# 
# # Call after each epoch:
# # sch_cos$step()

## -----------------------------------------------------------------------------
# with_grad_tape({
#   loss <- ag_softmax_cross_entropy_loss(model$forward(ag_tensor(x_tr)), y_tr)
# })
# grads <- backward(loss)
# 
# # Clip global gradient norm to max_norm
# clip_grad_norm(params, grads, max_norm = 5.0)
# 
# opt$step(grads)
# opt$zero_grad()

## -----------------------------------------------------------------------------
# dl <- ag_dataloader(x_tr, y_tr, batch_size = BS, shuffle = TRUE)
# 
# ag_train(model)
# for (ep in seq_len(100L)) {
#   for (batch in dl$epoch()) {
#     with_grad_tape({
#       loss <- ag_softmax_cross_entropy_loss(model$forward(batch$x), batch$y$data)
#     })
#     grads <- backward(loss)
#     opt$step(grads);  opt$zero_grad()
#   }
# }

## -----------------------------------------------------------------------------
# ag_eval(model)   # disables dropout, switches batch norm to inference stats
# 
# # Forward in chunks to avoid memory pressure
# predict_cm <- function(mod, x_cm, chunk = 64L) {
#   n   <- ncol(x_cm)
#   out <- NULL
#   for (s in seq(1L, n, by = chunk)) {
#     e  <- min(s + chunk - 1L, n)
#     lg <- mod$forward(ag_tensor(x_cm[, s:e, drop = FALSE]))$data
#     ev <- exp(lg - apply(lg, 2, max))
#     sm <- ev / colSums(ev)
#     out <- if (is.null(out)) sm else cbind(out, sm)
#   }
#   out
# }
# 
# probs <- predict_cm(model, x_vl)          # [3, 30]
# preds <- apply(probs, 2, which.max)
# truth <- apply(y_vl, 1, which.max)
# cat(sprintf("Val accuracy: %.4f\n", mean(preds == truth)))

## -----------------------------------------------------------------------------
# set.seed(7)
# W1 <- ag_param(matrix(rnorm(64*4) * sqrt(2/4),  64, 4))
# b1 <- ag_param(matrix(0.0, 64, 1))
# W2 <- ag_param(matrix(rnorm(3*64) * sqrt(2/64),  3, 64))
# b2 <- ag_param(matrix(0.0,  3, 1))
# 
# forward <- function(x)
#   ag_add(ag_matmul(W2, ag_relu(ag_add(ag_matmul(W1, x), b1))), b2)
# 
# opt_raw <- optimizer_adam(list(W1=W1, b1=b1, W2=W2, b2=b2), lr = 1e-3)
# 
# for (ep in seq_len(200L)) {
#   perm <- sample(n)
#   for (b in seq_len(ceiling(n / BS))) {
#     idx <- perm[seq((b-1L)*BS+1L, min(b*BS, n))]
#     xb  <- ag_tensor(x_tr[, idx, drop = FALSE])
#     yb  <- y_tr[, idx, drop = FALSE]
#     with_grad_tape({ loss_r <- ag_softmax_cross_entropy_loss(forward(xb), yb) })
#     gr <- backward(loss_r)
#     opt_raw$step(gr);  opt_raw$zero_grad()
#   }
# }

## -----------------------------------------------------------------------------
# # f16 on GPU, f32 on CPU
# device <- tryCatch({ ag_device("gpu"); "gpu" }, error = function(e) "cpu")
# ag_dtype(if (device == "gpu") "f16" else "f32")
# 
# # All subsequent ag_param / ag_tensor use the selected dtype

## -----------------------------------------------------------------------------
# # Full example: inst/examples/dp_train_demo.R