From b44807ff318e779eff8933c496fff7acde0ddf99 Mon Sep 17 00:00:00 2001
From: Fredrik Robertsen
Date: Sun, 1 Feb 2026 13:56:30 +0100
Subject: [PATCH] holy what is going on

---
Reviewer note: the hunks for src/linreg.odin and src/main.odin were edited by
hand after export (row leak in fitness_feature_selection, use-after-free in
run_ga_feature_selection, short-row check in load_dataset, unchecked result of
write_results_rmse). Hunk line counts and the diffstat are updated; the blob
ids on the `index` lines are stale, so apply without --3way. Indentation was
reconstructed with tabs; use --ignore-whitespace if a context line mismatches.

 res/{dataset.txt => dataset.csv} |   0
 src/common.odin                  |  40 +++++
 {utils => src}/linreg.odin       |  98 +++++++++++-
 {utils => src}/linreg_test.odin  |   2 +-
 src/main.odin                    | 257 ++++++++++++++++++++++++++-----
 5 files changed, 351 insertions(+), 46 deletions(-)
 rename res/{dataset.txt => dataset.csv} (100%)
 create mode 100644 src/common.odin
 rename {utils => src}/linreg.odin (72%)
 rename {utils => src}/linreg_test.odin (99%)

diff --git a/res/dataset.txt b/res/dataset.csv
similarity index 100%
rename from res/dataset.txt
rename to res/dataset.csv
diff --git a/src/common.odin b/src/common.odin
new file mode 100644
index 0000000..b107fa1
--- /dev/null
+++ b/src/common.odin
@@ -0,0 +1,40 @@
+package main
+
+import "core:container/bit_array"
+
+// Knapsack
+OUTPUT_FILE :: "output/data.csv"
+DATA_FILE :: "res/knapPI_12_500_1000_82.csv"
+NUMBER_OF_ITEMS :: 500
+CAPACITY :: 280785
+
+Item :: struct {
+	profit, weight: int,
+}
+
+// Feature selection
+DATASET_FILE :: "res/dataset.csv"
+NUMBER_OF_FEATURES :: 100
+DATASET_ROWS :: 1994
+
+Dataset_Record :: struct {
+	features: [NUMBER_OF_FEATURES]f64,
+	target: f64,
+}
+Dataset :: #soa[DATASET_ROWS]Dataset_Record
+
+// GA
+Chromosome :: ^bit_array.Bit_Array
+Population :: [POPULATION_SIZE]Chromosome
+POPULATION_SIZE :: 100
+GENERATIONS :: 100
+TOURNAMENT_SIZE :: 3
+CROSSOVER_RATE :: 0.8
+MUTATION_RATE :: 0.01
+RANDOM_SEED :: u64(42)
+
+// stats
+Data :: struct {
+	best, worst: int,
+	mean: f32,
+}
diff --git a/utils/linreg.odin b/src/linreg.odin
similarity index 72%
rename from utils/linreg.odin
rename to src/linreg.odin
index f947fe9..0fc00fe 100644
--- a/utils/linreg.odin
+++ b/src/linreg.odin
@@ -1,4 +1,4 @@
-package utils
+package main
 
 import "core:container/bit_array"
 import "core:math"
@@ -137,6 +137,12 @@ train_test_split :: proc(
 	test_count := int(f64(n) * test_size)
 	train_count := n - test_count
 
+	if n == 0 || len(X[0]) == 0 {
+		return nil, nil, nil, nil
+	}
+
+	n_features := len(X[0])
+
 	// Create shuffled indices
 	indices := make([]int, n)
 	defer delete(indices)
@@ -144,9 +150,10 @@ train_test_split :: proc(
 		indices[i] = i
 	}
 
-	// Shuffle using seed
+	// Shuffle
 	rng := rand.create(random_seed)
-	rand.shuffle(indices[:], rand.default_random_generator(&rng))
+	context.random_generator = rand.default_random_generator(&rng)
+	rand.shuffle(indices[:])
 
 	// Allocate splits
 	X_train = make([][]f64, train_count)
@@ -154,23 +161,26 @@ train_test_split :: proc(
 	y_train = make([]f64, train_count)
 	y_test = make([]f64, test_count)
 
-	// Copy training data
+	// Copy training data (DEEP COPY)
 	for i in 0 ..< train_count {
 		idx := indices[i]
-		X_train[i] = X[idx]
+		X_train[i] = make([]f64, n_features)
+		copy(X_train[i], X[idx])
 		y_train[i] = y[idx]
 	}
 
-	// Copy test data
+	// Copy test data (DEEP COPY)
 	for i in 0 ..< test_count {
 		idx := indices[train_count + i]
-		X_test[i] = X[idx]
+		X_test[i] = make([]f64, n_features)
+		copy(X_test[i], X[idx])
 		y_test[i] = y[idx]
 	}
 
 	return
 }
 
+
 // Extract columns based on bit_array chromosome
 get_columns :: proc(X: [][]f64, chrom: ^bit_array.Bit_Array) -> [][]f64 {
 	n_rows := len(X)
@@ -240,3 +250,77 @@ get_fitness :: proc(
 	// Return RMSE
 	return rmse(predictions, y_test)
 }
+
+// Extract selected features from dataset based on chromosome
+get_selected_features :: proc(dataset: Dataset, chrom: Chromosome) -> (X: [][]f64, y: []f64) {
+	n_rows := len(dataset)
+	n_features := bit_array.len(chrom)
+
+	// Count selected features
+	selected_count := 0
+	for i in 0 ..< n_features {
+		if bit_array.get(chrom, i) {
+			selected_count += 1
+		}
+	}
+
+	if selected_count == 0 {
+		return nil, nil
+	}
+
+	// Allocate
+	X = make([][]f64, n_rows)
+	y = make([]f64, n_rows)
+
+	// Extract
+	for i in 0 ..< n_rows {
+		X[i] = make([]f64, selected_count)
+		col_idx := 0
+		for j in 0 ..< n_features {
+			if bit_array.get(chrom, j) {
+				X[i][col_idx] = dataset[i].features[j]
+				col_idx += 1
+			}
+		}
+		y[i] = dataset[i].target
+	}
+
+	return X, y
+}
+
+
+// Fitness for feature selection (returns RMSE)
+fitness_feature_selection :: proc(
+	dataset: Dataset,
+	chrom: Chromosome,
+	random_seed: u64 = 0,
+) -> f64 {
+	X, y := get_selected_features(dataset, chrom)
+	if X == nil {
+		return math.F64_MAX
+	}
+	defer {
+		for row in X {delete(row)}
+		delete(X)
+		delete(y)
+	}
+
+	X_train, X_test, y_train, y_test := train_test_split(X, y, 0.2, random_seed)
+	defer {
+		// train_test_split deep-copies every row, so free the rows before the outer slices
+		for row in X_train {delete(row)}
+		for row in X_test {delete(row)}
+		delete(X_train)
+		delete(X_test)
+		delete(y_train)
+		delete(y_test)
+	}
+
+	beta := train_linear_regression(X_train, y_train)
+	defer delete(beta)
+
+	predictions := predict(X_test, beta)
+	defer delete(predictions)
+
+	return rmse(predictions, y_test)
+}
diff --git a/utils/linreg_test.odin b/src/linreg_test.odin
similarity index 99%
rename from utils/linreg_test.odin
rename to src/linreg_test.odin
index f8c7989..f8bd090 100644
--- a/utils/linreg_test.odin
+++ b/src/linreg_test.odin
@@ -1,4 +1,4 @@
-package utils
+package main
 
 import "core:fmt"
 import "core:math"
diff --git a/src/main.odin b/src/main.odin
index 159aada..5ba22f2 100644
--- a/src/main.odin
+++ b/src/main.odin
@@ -10,30 +10,8 @@ import "core:slice"
 import "core:strconv"
 import "core:strings"
 
-OUTPUT_FILE :: "output/data.csv"
-DATA_FILE :: "res/knapPI_12_500_1000_82.csv"
-NUMBER_OF_ITEMS :: 500
-CAPACITY :: 280785
-POPULATION_SIZE :: 100
-GENERATIONS :: 100
-TOURNAMENT_SIZE :: 3
-CROSSOVER_RATE :: 0.8
-MUTATION_RATE :: 0.01
-
-Item :: struct {
-	profit, weight: int,
-}
-
-Chromosome :: ^bit_array.Bit_Array
-Population :: [POPULATION_SIZE]Chromosome
-
+dataset: Dataset
 items: [NUMBER_OF_ITEMS]Item
-
-Data :: struct {
-	best, worst: int,
-	mean: f32,
-}
-
 stats: [GENERATIONS]Data
 
 read_data :: proc(file: string) -> (res: [NUMBER_OF_ITEMS]Item, ok := true) {
@@ -47,6 +25,40 @@ read_data :: proc(file: string) -> (res: [NUMBER_OF_ITEMS]Item, ok := true) {
 	return
 }
 
+load_dataset :: proc(filename: string) -> (data: Dataset, ok := true) {
+	file_data := os.read_entire_file(filename) or_return
+	defer delete(file_data)
+
+	r: csv.Reader
+	csv.reader_init_with_string(&r, string(file_data))
+	defer csv.reader_destroy(&r)
+
+	r.trim_leading_space = true
+	r.reuse_record = true
+
+	idx := 0
+	for {
+		record, err := csv.read(&r)
+		if err != nil {break}
+		if idx >= DATASET_ROWS {break}
+		// Reject short rows instead of indexing past the end of the record
+		if len(record) <= NUMBER_OF_FEATURES {return data, false}
+
+		// Parse features (columns 0-99)
+		for i in 0 ..< NUMBER_OF_FEATURES {
+			data[idx].features[i] = strconv.parse_f64(record[i]) or_return
+		}
+
+		// Parse target (column 100)
+		data[idx].target = strconv.parse_f64(record[NUMBER_OF_FEATURES]) or_return
+
+		idx += 1
+	}
+
+	return data, idx == DATASET_ROWS
+}
+
+
 write_results :: proc(filename: string, stats: []Data) -> bool {
 	handle, err := os.open(filename, os.O_CREATE | os.O_WRONLY | os.O_TRUNC, 0o644)
 	if err != os.ERROR_NONE {return false}
@@ -84,9 +96,13 @@ fitness :: proc(chrom: Chromosome) -> int {
 	return tot_profit - 500 * max(tot_weight - CAPACITY, 0)
 }
 
-create_random_chromosome :: proc() -> Chromosome {
-	chrom := bit_array.create(NUMBER_OF_ITEMS)
-	for i in 0 ..< NUMBER_OF_ITEMS {
+fitness_rmse :: proc(chrom: Chromosome) -> f64 {
+	return fitness_feature_selection(dataset, chrom, RANDOM_SEED)
+}
+
+create_random_chromosome :: proc(size: int = NUMBER_OF_ITEMS) -> Chromosome {
+	chrom := bit_array.create(size)
+	for i in 0 ..< size {
 		bit_array.set(chrom, i, rand.int_max(2) == 1)
 	}
 	return chrom
@@ -108,6 +124,14 @@ generate_population :: proc() -> Population {
 	return pop
 }
 
+generate_population_features :: proc() -> Population {
+	pop: Population
+	for i in 0 ..< POPULATION_SIZE {
+		pop[i] = create_random_chromosome(NUMBER_OF_FEATURES)
+	}
+	return pop
+}
+
 destroy_population :: proc(pop: ^Population) {
 	for chrom in pop {
 		bit_array.destroy(chrom)
@@ -122,6 +146,14 @@ evaluate_population :: proc(pop: ^Population) -> [POPULATION_SIZE]int {
 	return fitnesses
 }
 
+evaluate_population_rmse :: proc(pop: ^Population) -> [POPULATION_SIZE]f64 {
+	fitnesses: [POPULATION_SIZE]f64
+	for chrom, i in pop {
+		fitnesses[i] = fitness_rmse(chrom)
+	}
+	return fitnesses
+}
+
 tournament_selection :: proc(
 	pop: ^Population,
 	fitnesses: []int,
@@ -141,6 +173,22 @@ tournament_selection :: proc(
 	return pop[best_idx]
 }
 
+tournament_selection_rmse :: proc(pop: ^Population, fitnesses: []f64) -> Chromosome {
+	best_idx := rand.int_max(POPULATION_SIZE)
+	best_fitness := fitnesses[best_idx]
+
+	for _ in 1 ..< TOURNAMENT_SIZE {
+		idx := rand.int_max(POPULATION_SIZE)
+		if fitnesses[idx] < best_fitness { // Lower is better
+			best_idx = idx
+			best_fitness = fitnesses[idx]
+		}
+	}
+
+	return pop[best_idx]
+}
+
+
 roulette_selection :: proc(pop: ^Population, fitnesses: []int) -> Chromosome {
 	total_fitness := 0
 	for f in fitnesses {
@@ -336,6 +384,21 @@ compute_stats :: proc(fitnesses: []int) -> Data {
 	return {best, worst, f32(sum) / f32(len(fitnesses))}
 }
 
+compute_stats_rmse :: proc(fitnesses: []f64) -> [3]f64 {
+	best := math.F64_MAX
+	worst := -math.F64_MAX
+	sum := 0.0
+
+	for f in fitnesses {
+		best = min(best, f) // Lower is better
+		worst = max(worst, f) // Higher is worse
+		sum += f
+	}
+
+	mean := sum / f64(len(fitnesses))
+	return {best, mean, worst}
+}
+
 run_ga :: proc() {
 	population := generate_population()
 	defer destroy_population(&population)
@@ -397,22 +460,140 @@ run_ga :: proc() {
 	fmt.println("successfully wrote data to", OUTPUT_FILE)
 }
 
+run_baseline :: proc() -> f64 {
+	all_features := bit_array.create(NUMBER_OF_FEATURES)
+	defer bit_array.destroy(all_features)
+
+	// Select all features
+	for i in 0 ..< NUMBER_OF_FEATURES {
+		bit_array.set(all_features, i, true)
+	}
+
+	return fitness_feature_selection(dataset, all_features, RANDOM_SEED)
+}
+
+create_offspring_rmse :: proc(pop: ^Population, fitnesses: []f64) -> Population {
+	offspring: Population
+
+	for i := 0; i < POPULATION_SIZE; i += 2 {
+		parent1 := tournament_selection_rmse(pop, fitnesses)
+		parent2 := tournament_selection_rmse(pop, fitnesses)
+
+		child1, child2 := two_point_crossover(parent1, parent2)
+
+		swap_mutation(child1)
+		if i + 1 < POPULATION_SIZE {
+			swap_mutation(child2)
+		}
+
+		offspring[i] = child1
+		if i + 1 < POPULATION_SIZE {
+			offspring[i + 1] = child2
+		} else {
+			bit_array.destroy(child2)
+		}
+	}
+
+	return offspring
+}
+
+write_results_rmse :: proc(filename: string, stats: [][3]f64) -> bool {
+	handle, err := os.open(filename, os.O_CREATE | os.O_WRONLY | os.O_TRUNC, 0o644)
+	if err != os.ERROR_NONE {return false}
+	defer os.close(handle)
+
+	w: csv.Writer
+	csv.writer_init(&w, os.stream_from_handle(handle))
+
+	csv.write(&w, []string{"Generation", "Best", "Mean", "Worst"})
+
+	for stat, gen in stats {
+		csv.write(
+			&w,
+			[]string {
+				fmt.tprintf("%d", gen),
+				fmt.tprintf("%.6f", stat[0]),
+				fmt.tprintf("%.6f", stat[1]),
+				fmt.tprintf("%.6f", stat[2]),
+			},
+		)
+	}
+
+	csv.writer_flush(&w)
+	return true
+}
+
+run_ga_feature_selection :: proc() {
+	population := generate_population_features()
+	defer destroy_population(&population)
+
+	generation_stats := make([dynamic][3]f64, 0, GENERATIONS)
+	defer delete(generation_stats)
+
+	for gen in 0 ..< GENERATIONS {
+		fitnesses := evaluate_population_rmse(&population)
+		stats := compute_stats_rmse(fitnesses[:])
+		append(&generation_stats, stats)
+
+		fmt.printfln("Gen %d: Best=%.4f Mean=%.4f Worst=%.4f", gen, stats[0], stats[1], stats[2])
+
+		// Create offspring
+		offspring := create_offspring_rmse(&population, fitnesses[:])
+
+		// Replace population: free the old generation, then take ownership of the
+		// offspring (released later by the deferred destroy_population above)
+		destroy_population(&population)
+		population = offspring
+	}
+
+	// Write results
+	if !write_results_rmse(OUTPUT_FILE, generation_stats[:]) {
+		fmt.eprintln("Failed to write results to", OUTPUT_FILE)
+	}
+
+	// Final best solution
+	final_fitnesses := evaluate_population_rmse(&population)
+	best_idx := 0
+	best_rmse := final_fitnesses[0]
+	for f, i in final_fitnesses {
+		if f < best_rmse {
+			best_rmse = f
+			best_idx = i
+		}
+	}
+
+	// Count selected features
+	selected_count := 0
+	for i in 0 ..< NUMBER_OF_FEATURES {
+		if bit_array.get(population[best_idx], i) {
+			selected_count += 1
+		}
+	}
+
+	fmt.printfln("\nBest solution: %d features selected, RMSE=%.4f", selected_count, best_rmse)
+}
+
 main :: proc() {
-	data, ok := read_data(DATA_FILE)
+	// Load knapsack data
+	knapsack_data, ok := read_data(DATA_FILE)
 	if !ok {
-		fmt.eprintln("Failed to read data from", DATA_FILE)
+		fmt.eprintln("Failed to load knapsack data")
 		return
 	}
-	items = data
+	items = knapsack_data
 
-	fmt.println("Running Genetic Algorithm for Binary Knapsack Problem")
-	fmt.printfln(
-		"Items: %d, Capacity: %d, Population: %d, Generations: %d\n",
-		NUMBER_OF_ITEMS,
-		CAPACITY,
-		POPULATION_SIZE,
-		GENERATIONS,
-	)
+	// Load feature selection dataset
+	feature_data, dataset_ok := load_dataset(DATASET_FILE)
+	if !dataset_ok {
+		fmt.eprintln("Failed to load dataset from:", DATASET_FILE)
+		return
+	}
+	dataset = feature_data
 
-	run_ga()
+	fmt.println("=== Baseline (All Features) ===")
+	baseline_rmse := run_baseline()
+	fmt.printfln("RMSE with all features: %.4f\n", baseline_rmse)
+
+	fmt.println("=== GA Feature Selection ===")
+	run_ga_feature_selection()
 }