Usage Examples
This section collects practical, runnable examples of SKaiNET in use. Every snippet
is real code from the skainet-docs-samples module — compiled and executed in CI, so
nothing here can drift from the API.
For task-focused guides see the how-to section; for the shortest end-to-end path see Kotlin getting started.
Basic Operations
Matrix Multiplication Examples
The examples below use the real SKaiNET DSL. For the wider tensor construction surface (single vs many tensors, named maps, init strategies), see How-to: Build Tensors. For how matmul dispatches into the kernel layer (and what the benchmarks measure), see Reading the matmul benchmark.
Basic Usage
Simple Matrix Multiplication
import sk.ainet.context.DirectCpuExecutionContext
import sk.ainet.context.data
import sk.ainet.lang.tensor.Tensor
import sk.ainet.lang.tensor.dsl.tensor
import sk.ainet.lang.types.FP32
// Builds a 3x2 and a 2x4 FP32 tensor on a direct CPU context, multiplies them,
// and prints the shape of the product.
val ctx = DirectCpuExecutionContext.create()

// Declared up front so both tensors can be assigned inside one data(ctx) block.
lateinit var a: Tensor<FP32, Float>
lateinit var b: Tensor<FP32, Float>

data(ctx) {
    // Shape (3, 2), filled from six literal values.
    a = tensor<FP32, Float> {
        shape(3, 2) { from(1f, 2f, 3f, 4f, 5f, 6f) }
    }
    // Shape (2, 4), filled from eight literal values.
    b = tensor<FP32, Float> {
        shape(2, 4) {
            from(
                1f, 0f, 1f, 0f,
                0f, 1f, 0f, 1f,
            )
        }
    }
}

// (3, 2) x (2, 4) -> (3, 4)
val result = ctx.ops.matmul(a, b)
// Interpolate the shape directly. The previous `${'$'}{...}` form is a leaked
// template escape: it prints the placeholder text instead of the shape.
println("Result shape: ${result.data.shape}") // Shape(3, 4)
Batch Operations
Matmul is batched whenever the inputs carry a leading batch
dimension; there’s no separate "batched matmul" entry point —
ctx.ops.matmul broadcasts along the leading dim.
// Left operand: [batch=2, m=3, k=2]
val batchA: Tensor<FP32, Float> = data<FP32, Float>(ctx) {
    tensor {
        shape(2, 3, 2) {
            from(
                1f, 2f, 3f, 4f, 5f, 6f, // first sample
                2f, 1f, 4f, 3f, 6f, 5f, // second sample
            )
        }
    }
}

// Right operand: [batch=2, k=2, n=3]
val batchB: Tensor<FP32, Float> = data<FP32, Float>(ctx) {
    tensor {
        shape(2, 2, 3) {
            from(
                1f, 0f, 1f, 0f, 1f, 0f,
                0f, 1f, 0f, 1f, 0f, 1f,
            )
        }
    }
}

// Same entry point as the 2D case; the leading dimension is the batch.
val batchResult = ctx.ops.matmul(batchA, batchB)
// batchResult.data.shape == Shape(2, 3, 3)
Linear Layer
A linear layer is a single matmul plus an optional bias add.
Construction uses the same data { } pattern; the forward pass
goes through ctx.ops:
import kotlin.math.sqrt
/**
 * Linear layer: one matmul against [weights], followed by an add of [bias]
 * when a bias tensor was supplied.
 *
 * @property ctx execution context whose `ops` perform the matmul and the add.
 * @property weights weight tensor, laid out as `[in_features, out_features]`.
 * @property bias optional bias tensor; when `null` the add is skipped.
 */
class LinearLayer(
    private val ctx: DirectCpuExecutionContext,
    private val weights: Tensor<FP32, Float>,
    private val bias: Tensor<FP32, Float>? = null,
) {
    /**
     * Runs the forward pass.
     *
     * @param input tensor of shape `[batch, in_features]`.
     * @return tensor of shape `[batch, out_features]`.
     */
    fun forward(input: Tensor<FP32, Float>): Tensor<FP32, Float> {
        val projected = ctx.ops.matmul(input, weights)
        // Without a bias the projection is the result; otherwise add the bias (broadcasting add).
        return if (bias == null) projected else ctx.ops.add(projected, bias)
    }
}
// Example wiring for LinearLayer: 784 input features, 256 output features, batch of 32.
val inputSize = 784
val hiddenSize = 256
val batchSize = 32
// Standard deviation for the weight init: sqrt(2 / (inputSize + hiddenSize)).
val std = sqrt(2.0f / (inputSize + hiddenSize))
// Declared up front so all three tensors can be assigned inside one data(ctx) block.
lateinit var weights: Tensor<FP32, Float>
lateinit var bias: Tensor<FP32, Float>
lateinit var input: Tensor<FP32, Float>
data(ctx) {
// Weights: [inputSize, hiddenSize], drawn with randn(mean = 0, std = std).
weights = tensor<FP32, Float> {
shape(inputSize, hiddenSize) { randn(mean = 0f, std = std) }
}
// Bias: rank-1 tensor of length hiddenSize, initialised with zeros().
bias = tensor<FP32, Float> { shape(hiddenSize) { zeros() } }
// Input batch: [batchSize, inputSize], drawn with randn(mean = 0, std = 1).
input = tensor<FP32, Float> {
shape(batchSize, inputSize) { randn(mean = 0f, std = 1f) }
}
}
val layer = LinearLayer(ctx, weights, bias)
val output = layer.forward(input)
// output.data.shape == Shape(32, 256)
Performance Considerations
ctx.ops.matmul automatically routes to the highest-priority
registered kernel via KernelRegistry.bestAvailable(). On JDK 21+
with the incubator Vector module loaded, that’s the Panama Vector
kernel — typically ~14–23 GFLOPS on AVX2 for a 1024³ FP32 GEMM
depending on the workload shape (the mnpack tile-microkernel dispatch
adds ~1.7× over the naive Panama 1×1 inner loop). The numbers and
the kernel selection mechanics are detailed in
Reading the matmul benchmark
and the engine benchmark program.
For quantized matmul (Q4_K, Q8_0, BF16-weight), load weights via
the GGUF / SafeTensors loaders — the
loaders preserve packed-block storage, and the matmul dispatch
recognises the quantized TensorData subtype and routes to the
matching SPI kernel.
Common Patterns
Matrix-Vector Multiplication
A 1D vector is rank 1; matmul against a 2D matrix requires the
vector to be reshaped to rank 2 first (no implicit broadcasting
between rank-1 and rank-2 in ctx.ops.matmul):
// Matrix-vector product expressed as a rank-2 x rank-2 matmul.
lateinit var matrix: Tensor<FP32, Float>
lateinit var vector: Tensor<FP32, Float>
data(ctx) {
// Matrix of shape (100, 50), drawn with randn(mean = 0, std = 1).
matrix = tensor<FP32, Float> {
shape(100, 50) { randn(mean = 0f, std = 1f) }
}
// Shape (50, 1) so matmul produces (100, 1).
vector = tensor<FP32, Float> {
shape(50, 1) { randn(mean = 0f, std = 1f) }
}
}
// (100, 50) x (50, 1) -> (100, 1)
val result = ctx.ops.matmul(matrix, vector)
// result.data.shape == Shape(100, 1)
Transpose Before Matmul
ctx.ops.transpose produces a tensor view that the matmul dispatch
recognises; for some packed quantized formats the transpose is lazy
(no data reordering — see Q4MemorySegmentTensorData in
skainet-backend-cpu for the marker class).
// Transpose `a`, then pass the transposed tensor as the right-hand matmul operand.
// NOTE(review): `a` and `b` are not defined in this snippet. If they are the (3, 2) and
// (2, 4) tensors from "Simple Matrix Multiplication", this call is (2, 4) x (2, 3) and the
// inner dimensions do not match — confirm which tensors the sample intends.
val aT = ctx.ops.transpose(a)
val result = ctx.ops.matmul(b, aT)
Tensor Creation
/**
 * Builds a single 2x2 FP32 tensor from four literal values using the data DSL.
 *
 * @param ctx execution context the tensor is created in.
 * @return the tensor produced by the `data` block.
 */
fun oneTensor(ctx: DirectCpuExecutionContext): Tensor<FP32, Float> {
    val built = data<FP32, Float>(ctx) {
        tensor {
            shape(2, 2) { from(1f, 2f, 3f, 4f) }
        }
    }
    return built
}
See Build tensors with the data DSL for every construction form and initialization strategy.
Tensor Operations
/**
 * Chains four tensor operations: matmul, transpose, reshape and ReLU.
 *
 * @param ctx execution context the operand tensors are created in.
 * @return the result of applying `relu()` to the flattened, transposed product.
 */
fun ops(ctx: DirectCpuExecutionContext): Tensor<FP32, Float> {
    lateinit var lhs: Tensor<FP32, Float>
    lateinit var rhs: Tensor<FP32, Float>

    data(ctx) {
        lhs = tensor { shape(2, 3) { from(1f, 2f, 3f, 4f, 5f, 6f) } }
        rhs = tensor { shape(3, 2) { from(1f, 0f, 0f, 1f, 1f, 1f) } }
    }

    return lhs
        .matmul(rhs)        // [2,3] x [3,2] -> [2,2]
        .t()                // [2,2] -> [2,2]
        .reshape(Shape(4))  // flatten to four elements
        .relu()
}
Broadcasting Operations
/**
 * Demonstrates broadcasting adds: a [1,3] tensor added to a [2,3] tensor,
 * then a scalar added to the result.
 *
 * @param ctx execution context the operand tensors are created in.
 * @return the tensor after both additions.
 */
fun broadcast(ctx: DirectCpuExecutionContext): Tensor<FP32, Float> {
    lateinit var grid: Tensor<FP32, Float>
    lateinit var rowOffset: Tensor<FP32, Float>

    data(ctx) {
        grid = tensor { shape(2, 3) { from(1f, 2f, 3f, 4f, 5f, 6f) } }
        rowOffset = tensor { shape(1, 3) { from(10f, 20f, 30f) } }
    }

    // Evaluated left to right: ([2,3] + [1,3]) -> [2,3], then the scalar
    // broadcasts to every element.
    return grid + rowOffset + 100f
}
See Apply tensor operations for the full discussion.
Neural Network Examples
Defining a model
/**
 * Declares a sequential FP32 model with a 784-wide input, a 128-unit dense layer
 * with a ReLU activation, and a 10-unit dense layer with `softmax(1)`.
 *
 * @param ctx execution context passed to the `sequential` builder.
 * @return whatever `sequential` returns (the return type is inferred).
 */
fun buildModel(ctx: DirectCpuExecutionContext) =
sequential<FP32, Float>(ctx) {
input(784) // 28x28 flattened
dense(128) { activation = { it.relu() } } // hidden layer
dense(10) { activation = { it.softmax(1) } } // class scores
}
Running inference
/**
 * Runs one forward pass of the model from `buildModel` over a single sample.
 *
 * @param pixels flat feature values for one sample; wrapped as a `[1, 784]` tensor.
 * @return the model output for that sample, `[1, 10]` class scores.
 */
fun classify(pixels: FloatArray): Tensor<FP32, Float> {
    val cpu = DirectCpuExecutionContext.create()
    val network = buildModel(cpu)
    // Shape is [batch, features]; one sample here.
    val sample = cpu.fromFloatArray<FP32, Float>(Shape(1, 784), FP32::class, pixels)
    return network.forward(sample, cpu) // [1, 10] class scores
}
Training loops
// A graph (autograd) context for training; a plain CPU context for inference.
// Plain CPU context; its ops back the graph context below and it creates the data tensors.
val baseCtx = DirectCpuExecutionContext()
// Graph context in TRAIN phase, creating a DefaultGradientTape through the tape factory.
val trainCtx = DefaultGraphExecutionContext(
baseOps = baseCtx.ops,
phase = Phase.TRAIN,
createTapeFactory = { _ -> DefaultGradientTape() },
)
// Fixed seed; the same Random instance feeds both dense layers' weight initialisers.
val rng = Random(42)
// 2 inputs -> dense(8) -> tanh -> dense(1) -> tanh, built on the training context.
val model = sequential<FP32, Float>(trainCtx) {
input(2)
dense(8) { weights { randn(std = 0.5f, random = rng) } }
activation { it.tanh() }
dense(1) { weights { randn(std = 0.5f, random = rng) } }
activation { it.tanh() }
}
// `n`, `featuresFlat` and `labelsFlat` are defined outside this snippet.
// Features are wrapped as [n, 2] and labels as [n, 1].
val x = baseCtx.fromFloatArray<FP32, Float>(Shape(n, 2), FP32::class, featuresFlat)
val y = baseCtx.fromFloatArray<FP32, Float>(Shape(n, 1), FP32::class, labelsFlat)
// Training runner: the model above, MSE loss, and SGD with lr = 0.1.
val runner = training<FP32, Float> {
model { model }
loss { MSELoss() }
optimizer {
// Register every trainable parameter of the model with the optimizer.
sgd(lr = 0.1).apply {
model.trainableParameters().forEach { addParameter(it) }
}
}
}
// Loss after the first step and after the most recent step.
var firstLoss = 0f
var lastLoss = 0f

// 150 training steps over the same (x, y) tensors.
for (epoch in 0 until 150) {
    val stepLoss = runner.step(trainCtx, x, y).data.get()
    if (epoch == 0) {
        firstLoss = stepLoss
    }
    lastLoss = stepLoss
}
Measuring accuracy
// Metric: classification accuracy on a fresh inference context.
val evalCtx = DirectCpuExecutionContext()
val preds = model.forward(x, evalCtx)

// A row counts as correct when the sign of its score (>= 0 -> 1, otherwise -1)
// equals the label stored at the same index.
val correct = (0 until n).count { row ->
    val predicted = if (preds.data.get(row, 0) >= 0f) 1f else -1f
    predicted == labelsFlat[row]
}

val accuracy = correct.toFloat() / n