holy what is going on

2026-02-01 13:56:30 +01:00
parent 6beb7a6311
commit b44807ff31
5 changed files with 344 additions and 46 deletions
--- a/res/dataset.csv
+++ b/res/dataset.csv
--- a/src/common.odin
+++ b/src/common.odin
@@ -0,0 +1,40 @@
+package main
+
+import "core:container/bit_array"
+
+// Knapsack
+// CSV path the GA runs write their per-generation statistics to.
+OUTPUT_FILE :: "output/data.csv"
+// Knapsack instance file; main passes it to read_data.
+DATA_FILE :: "res/knapPI_12_500_1000_82.csv"
+// Number of knapsack items: size of the global items array and the default chromosome length.
+NUMBER_OF_ITEMS :: 500
+// Knapsack weight limit; the knapsack fitness subtracts a penalty for weight above it.
+CAPACITY :: 280785
+
+// One knapsack item.
+Item :: struct {
+	profit, weight: int,
+}
+
+// Feature selection
+// CSV file main passes to load_dataset.
+DATASET_FILE :: "res/dataset.csv"
+// Feature columns per record; load_dataset reads the target from the column right after them.
+NUMBER_OF_FEATURES :: 100
+// Exact number of rows load_dataset must read for the load to count as successful.
+DATASET_ROWS :: 1994
+
+// One dataset row: the feature values plus the regression target.
+Dataset_Record :: struct {
+	features: [NUMBER_OF_FEATURES]f64,
+	target:   f64,
+}
+// Fixed-size structure-of-arrays table holding every dataset row.
+Dataset :: #soa[DATASET_ROWS]Dataset_Record
+
+// GA
+// A chromosome is a pointer to a bit array: one bit per knapsack item or per feature.
+Chromosome :: ^bit_array.Bit_Array
+// A population is a fixed-size array of chromosome pointers.
+Population :: [POPULATION_SIZE]Chromosome
+// Chromosomes per population.
+POPULATION_SIZE :: 100
+// Number of generations each GA run iterates for.
+GENERATIONS :: 100
+// Candidates drawn per tournament (the first pick plus TOURNAMENT_SIZE - 1 challengers).
+TOURNAMENT_SIZE :: 3
+// NOTE(review): presumably read by the crossover and mutation procs, which are not
+// visible in this change — confirm before tuning.
+CROSSOVER_RATE :: 0.8
+MUTATION_RATE :: 0.01
+// Seed forwarded to fitness_feature_selection by fitness_rmse and run_baseline.
+RANDOM_SEED :: u64(42)
+
+// stats
+// Per-generation summary of the integer (knapsack) fitness values.
+Data :: struct {
+	best, worst: int,
+	mean:        f32,
+}
--- a/utils/linreg.odin
+++ b/utils/linreg.odin
@@ -1,4 +1,4 @@
-package utils
+package main

 import "core:container/bit_array"
 import "core:math"
@@ -137,6 +137,12 @@ train_test_split :: proc(
 	test_count := int(f64(n) * test_size)
 	train_count := n - test_count

+	if n == 0 || len(X[0]) == 0 {
+		return nil, nil, nil, nil
+	}
+
+	n_features := len(X[0])
+
 	// Create shuffled indices
 	indices := make([]int, n)
 	defer delete(indices)
@@ -144,9 +150,10 @@ train_test_split :: proc(
 		indices[i] = i
 	}

-	// Shuffle using seed
+	// Shuffle
 	rng := rand.create(random_seed)
-	rand.shuffle(indices[:], rand.default_random_generator(&rng))
+	context.random_generator = rand.default_random_generator(&rng)
+	rand.shuffle(indices[:])

 	// Allocate splits
 	X_train = make([][]f64, train_count)
@@ -154,23 +161,26 @@ train_test_split :: proc(
 	y_train = make([]f64, train_count)
 	y_test = make([]f64, test_count)

-	// Copy training data
+	// Copy training data (DEEP COPY)
 	for i in 0 ..< train_count {
 		idx := indices[i]
-		X_train[i] = X[idx]
+		X_train[i] = make([]f64, n_features)
+		copy(X_train[i], X[idx])
 		y_train[i] = y[idx]
 	}

-	// Copy test data
+	// Copy test data (DEEP COPY)
 	for i in 0 ..< test_count {
 		idx := indices[train_count + i]
-		X_test[i] = X[idx]
+		X_test[i] = make([]f64, n_features)
+		copy(X_test[i], X[idx])
 		y_test[i] = y[idx]
 	}

 	return
 }

+
 // Extract columns based on bit_array chromosome
 get_columns :: proc(X: [][]f64, chrom: ^bit_array.Bit_Array) -> [][]f64 {
 	n_rows := len(X)
@@ -240,3 +250,74 @@ get_fitness :: proc(
 	// Return RMSE
 	return rmse(predictions, y_test)
 }
+
+// Extract selected features from dataset based on chromosome
+//
+// Builds a row-major feature matrix X holding only the columns whose bit is
+// set in `chrom`, plus the target vector y (one entry per dataset row).
+// Returns nil, nil when no bit is set. The caller owns the result and must
+// delete every row of X, then X and y.
+get_selected_features :: proc(dataset: Dataset, chrom: Chromosome) -> (X: [][]f64, y: []f64) {
+	n_rows := len(dataset)
+	n_features := bit_array.len(chrom)
+
+	// Resolve the selected column indices once; they are the same for every
+	// row, so the chromosome does not need to be re-scanned per row.
+	selected := make([dynamic]int, 0, n_features)
+	defer delete(selected)
+	for j in 0 ..< n_features {
+		if bit_array.get(chrom, j) {
+			append(&selected, j)
+		}
+	}
+
+	selected_count := len(selected)
+	if selected_count == 0 {
+		return nil, nil
+	}
+
+	// Allocate
+	X = make([][]f64, n_rows)
+	y = make([]f64, n_rows)
+
+	// Extract
+	for i in 0 ..< n_rows {
+		row := make([]f64, selected_count)
+		for feature_idx, col_idx in selected {
+			row[col_idx] = dataset[i].features[feature_idx]
+		}
+		X[i] = row
+		y[i] = dataset[i].target
+	}
+
+	return X, y
+}
+
+
+// Fitness for feature selection (returns RMSE)
+//
+// Extracts the columns selected by `chrom`, splits the rows 80/20 into
+// train/test using `random_seed`, fits a linear regression on the training
+// part and returns the RMSE of its predictions on the test part.
+// Returns math.F64_MAX when the chromosome selects no feature.
+// Every buffer allocated here is released before returning.
+fitness_feature_selection :: proc(
+	dataset: Dataset,
+	chrom: Chromosome,
+	random_seed: u64 = 0,
+) -> f64 {
+	X, y := get_selected_features(dataset, chrom)
+	if X == nil {
+		return math.F64_MAX
+	}
+	defer {
+		for row in X {delete(row)}
+		delete(X)
+		delete(y)
+	}
+
+	X_train, X_test, y_train, y_test := train_test_split(X, y, 0.2, random_seed)
+	defer {
+		// train_test_split deep-copies every row, so the row buffers are owned
+		// here and must be freed along with the outer slices.
+		for row in X_train {delete(row)}
+		for row in X_test {delete(row)}
+		delete(X_train)
+		delete(X_test)
+		delete(y_train)
+		delete(y_test)
+	}
+
+	beta := train_linear_regression(X_train, y_train)
+	defer delete(beta)
+
+	predictions := predict(X_test, beta)
+	defer delete(predictions)
+
+	return rmse(predictions, y_test)
+}
--- a/utils/linreg_test.odin
+++ b/utils/linreg_test.odin
@@ -1,4 +1,4 @@
-package utils
+package main

 import "core:fmt"
 import "core:math"
--- a/src/main.odin
+++ b/src/main.odin
@@ -10,30 +10,8 @@ import "core:slice"
 import "core:strconv"
 import "core:strings"

-OUTPUT_FILE :: "output/data.csv"
-DATA_FILE :: "res/knapPI_12_500_1000_82.csv"
-NUMBER_OF_ITEMS :: 500
-CAPACITY :: 280785
-POPULATION_SIZE :: 100
-GENERATIONS :: 100
-TOURNAMENT_SIZE :: 3
-CROSSOVER_RATE :: 0.8
-MUTATION_RATE :: 0.01
-
-Item :: struct {
-	profit, weight: int,
-}
-
-Chromosome :: ^bit_array.Bit_Array
-Population :: [POPULATION_SIZE]Chromosome
-
+// Feature-selection dataset; main assigns it from load_dataset, and fitness_rmse and run_baseline read it.
+dataset: Dataset
 items: [NUMBER_OF_ITEMS]Item
-
-Data :: struct {
-	best, worst: int,
-	mean:        f32,
-}
-
 stats: [GENERATIONS]Data

 read_data :: proc(file: string) -> (res: [NUMBER_OF_ITEMS]Item, ok := true) {
@@ -47,6 +25,38 @@ read_data :: proc(file: string) -> (res: [NUMBER_OF_ITEMS]Item, ok := true) {
 	return
 }

+// Loads the feature-selection dataset from the CSV file `filename`.
+//
+// Each row must hold NUMBER_OF_FEATURES feature values followed by the target
+// value. At most DATASET_ROWS rows are read. `ok` is false when the file cannot
+// be read, a row has too few fields, a field is not a valid f64, or fewer than
+// DATASET_ROWS rows were read.
+load_dataset :: proc(filename: string) -> (data: Dataset, ok := true) {
+	file_data := os.read_entire_file(filename) or_return
+	defer delete(file_data)
+
+	r: csv.Reader
+	csv.reader_init_with_string(&r, string(file_data))
+	defer csv.reader_destroy(&r)
+
+	r.trim_leading_space = true
+	r.reuse_record = true
+
+	idx := 0
+	for idx < DATASET_ROWS {
+		record, err := csv.read(&r)
+		if err != nil {break}
+
+		// A short row would index past the end of the record below; report it
+		// as a failed load instead of panicking.
+		if len(record) <= NUMBER_OF_FEATURES {
+			return data, false
+		}
+
+		// Parse features (columns 0-99)
+		for i in 0 ..< NUMBER_OF_FEATURES {
+			data[idx].features[i] = strconv.parse_f64(record[i]) or_return
+		}
+
+		// Parse target (column 100)
+		data[idx].target = strconv.parse_f64(record[NUMBER_OF_FEATURES]) or_return
+
+		idx += 1
+	}
+
+	return data, idx == DATASET_ROWS
+}
+
+
 write_results :: proc(filename: string, stats: []Data) -> bool {
 	handle, err := os.open(filename, os.O_CREATE | os.O_WRONLY | os.O_TRUNC, 0o644)
 	if err != os.ERROR_NONE {return false}
@@ -84,9 +94,13 @@ fitness :: proc(chrom: Chromosome) -> int {
 	return tot_profit - 500 * max(tot_weight - CAPACITY, 0)
 }

-create_random_chromosome :: proc() -> Chromosome {
-	chrom := bit_array.create(NUMBER_OF_ITEMS)
-	for i in 0 ..< NUMBER_OF_ITEMS {
+// RMSE fitness of `chrom` on the global dataset (lower is better). Forwards to
+// fitness_feature_selection with the fixed RANDOM_SEED, which seeds the shuffle
+// in train_test_split.
+fitness_rmse :: proc(chrom: Chromosome) -> f64 {
+	return fitness_feature_selection(dataset, chrom, RANDOM_SEED)
+}
+
+create_random_chromosome :: proc(size: int = NUMBER_OF_ITEMS) -> Chromosome {
+	chrom := bit_array.create(size)
+	for i in 0 ..< size {
 		bit_array.set(chrom, i, rand.int_max(2) == 1)
 	}
 	return chrom
@@ -108,6 +122,14 @@ generate_population :: proc() -> Population {
 	return pop
 }

+// Builds the initial feature-selection population: every slot receives a
+// random chromosome with one bit per feature.
+generate_population_features :: proc() -> Population {
+	result: Population
+	for slot := 0; slot < len(result); slot += 1 {
+		result[slot] = create_random_chromosome(NUMBER_OF_FEATURES)
+	}
+	return result
+}
+
 destroy_population :: proc(pop: ^Population) {
 	for chrom in pop {
 		bit_array.destroy(chrom)
@@ -122,6 +144,14 @@ evaluate_population :: proc(pop: ^Population) -> [POPULATION_SIZE]int {
 	return fitnesses
 }

+// Scores every chromosome of `pop` with fitness_rmse; entry i of the result
+// is the RMSE of chromosome i.
+evaluate_population_rmse :: proc(pop: ^Population) -> [POPULATION_SIZE]f64 {
+	scores: [POPULATION_SIZE]f64
+	for idx := 0; idx < POPULATION_SIZE; idx += 1 {
+		scores[idx] = fitness_rmse(pop[idx])
+	}
+	return scores
+}
+
 tournament_selection :: proc(
 	pop: ^Population,
 	fitnesses: []int,
@@ -141,6 +171,22 @@ tournament_selection :: proc(
 	return pop[best_idx]
 }

+// Tournament selection for a minimisation objective: draws TOURNAMENT_SIZE
+// random contestants (with replacement) and returns the one with the lowest
+// RMSE. The returned pointer is the population's own chromosome, not a copy.
+tournament_selection_rmse :: proc(pop: ^Population, fitnesses: []f64) -> Chromosome {
+	winner := rand.int_max(POPULATION_SIZE)
+	winner_rmse := fitnesses[winner]
+
+	// The first contestant is already drawn; draw the remaining ones
+	for round := 1; round < TOURNAMENT_SIZE; round += 1 {
+		challenger := rand.int_max(POPULATION_SIZE)
+		challenger_rmse := fitnesses[challenger]
+		if challenger_rmse < winner_rmse {
+			winner, winner_rmse = challenger, challenger_rmse
+		}
+	}
+
+	return pop[winner]
+}
+
+
 roulette_selection :: proc(pop: ^Population, fitnesses: []int) -> Chromosome {
 	total_fitness := 0
 	for f in fitnesses {
@@ -336,6 +382,21 @@ compute_stats :: proc(fitnesses: []int) -> Data {
 	return {best, worst, f32(sum) / f32(len(fitnesses))}
 }

+// Summarises one generation of RMSE values.
+// Returns {best, mean, worst}, where best is the smallest value (lower RMSE
+// is better) and worst is the largest.
+compute_stats_rmse :: proc(fitnesses: []f64) -> [3]f64 {
+	lowest := math.F64_MAX
+	highest := -math.F64_MAX
+	total := 0.0
+
+	for i := 0; i < len(fitnesses); i += 1 {
+		value := fitnesses[i]
+		lowest = min(lowest, value)
+		highest = max(highest, value)
+		total += value
+	}
+
+	summary: [3]f64
+	summary[0] = lowest
+	summary[1] = total / f64(len(fitnesses))
+	summary[2] = highest
+	return summary
+}
+
 run_ga :: proc() {
 	population := generate_population()
 	defer destroy_population(&population)
@@ -397,22 +458,138 @@ run_ga :: proc() {
 	fmt.println("successfully wrote data to", OUTPUT_FILE)
 }

+// Baseline for the GA: RMSE of the regression when every feature is selected,
+// evaluated on the global dataset with RANDOM_SEED.
+run_baseline :: proc() -> f64 {
+	every_feature := bit_array.create(NUMBER_OF_FEATURES)
+	defer bit_array.destroy(every_feature)
+
+	// Turn every bit on so that no feature is excluded
+	for bit := 0; bit < NUMBER_OF_FEATURES; bit += 1 {
+		bit_array.set(every_feature, bit, true)
+	}
+
+	baseline := fitness_feature_selection(dataset, every_feature, RANDOM_SEED)
+	return baseline
+}
+
+// Breeds the next generation for the RMSE objective. Parents are picked by
+// tournament selection, recombined with two_point_crossover and each kept
+// child is passed through swap_mutation. Children are produced in pairs; a
+// second child that has no slot left is destroyed instead of stored.
+create_offspring_rmse :: proc(pop: ^Population, fitnesses: []f64) -> Population {
+	next_gen: Population
+
+	for slot := 0; slot < POPULATION_SIZE; slot += 2 {
+		mother := tournament_selection_rmse(pop, fitnesses)
+		father := tournament_selection_rmse(pop, fitnesses)
+		first, second := two_point_crossover(mother, father)
+
+		swap_mutation(first)
+		next_gen[slot] = first
+
+		// Only reachable when the population size is odd: drop the surplus child
+		if slot + 1 >= POPULATION_SIZE {
+			bit_array.destroy(second)
+			continue
+		}
+
+		swap_mutation(second)
+		next_gen[slot + 1] = second
+	}
+
+	return next_gen
+}
+
+// Writes the per-generation RMSE statistics to `filename` as CSV with the
+// columns Generation, Best, Mean, Worst. Each entry of `stats` is expected in
+// {best, mean, worst} order. Returns false only when the file cannot be opened.
+write_results_rmse :: proc(filename: string, stats: [][3]f64) -> bool {
+	fd, open_err := os.open(filename, os.O_CREATE | os.O_WRONLY | os.O_TRUNC, 0o644)
+	if open_err != os.ERROR_NONE {
+		return false
+	}
+	defer os.close(fd)
+
+	writer: csv.Writer
+	csv.writer_init(&writer, os.stream_from_handle(fd))
+
+	header := []string{"Generation", "Best", "Mean", "Worst"}
+	csv.write(&writer, header)
+
+	for gen in 0 ..< len(stats) {
+		row := []string {
+			fmt.tprintf("%d", gen),
+			fmt.tprintf("%.6f", stats[gen][0]),
+			fmt.tprintf("%.6f", stats[gen][1]),
+			fmt.tprintf("%.6f", stats[gen][2]),
+		}
+		csv.write(&writer, row)
+	}
+
+	csv.writer_flush(&writer)
+	return true
+}
+
+// Runs the feature-selection GA for GENERATIONS generations.
+//
+// Each generation: score the population (RMSE, lower is better), record and
+// print {best, mean, worst}, breed the offspring and replace the population
+// with it. Afterwards the statistics are written to OUTPUT_FILE and the best
+// chromosome of the final population is reported.
+run_ga_feature_selection :: proc() {
+	population := generate_population_features()
+	defer destroy_population(&population)
+
+	generation_stats := make([dynamic][3]f64, 0, GENERATIONS)
+	defer delete(generation_stats)
+
+	for gen in 0 ..< GENERATIONS {
+		fitnesses := evaluate_population_rmse(&population)
+		gen_stats := compute_stats_rmse(fitnesses[:])
+		append(&generation_stats, gen_stats)
+
+		fmt.printfln("Gen %d: Best=%.4f Mean=%.4f Worst=%.4f", gen, gen_stats[0], gen_stats[1], gen_stats[2])
+
+		// Create offspring
+		offspring := create_offspring_rmse(&population, fitnesses[:])
+
+		// Replace population. Ownership of the offspring chromosomes moves to
+		// `population`, so they must NOT be destroyed here: they are freed when
+		// the next generation replaces them or by the deferred destroy above.
+		destroy_population(&population)
+		population = offspring
+	}
+
+	// Write results
+	if !write_results_rmse(OUTPUT_FILE, generation_stats[:]) {
+		fmt.eprintln("Failed to write results to", OUTPUT_FILE)
+	}
+
+	// Final best solution
+	final_fitnesses := evaluate_population_rmse(&population)
+	best_idx := 0
+	best_rmse := final_fitnesses[0]
+	for f, i in final_fitnesses {
+		if f < best_rmse {
+			best_rmse = f
+			best_idx = i
+		}
+	}
+
+	// Count selected features
+	selected_count := 0
+	for i in 0 ..< NUMBER_OF_FEATURES {
+		if bit_array.get(population[best_idx], i) {
+			selected_count += 1
+		}
+	}
+
+	fmt.printfln("\nBest solution: %d features selected, RMSE=%.4f", selected_count, best_rmse)
+}
+
+// Entry point: loads the knapsack items and the feature-selection dataset into
+// their globals (returning early if either load fails), prints the all-features
+// baseline RMSE, then runs the feature-selection GA.
 main :: proc() {
-	data, ok := read_data(DATA_FILE)
+	// Load knapsack data
+	knapsack_data, ok := read_data(DATA_FILE)
 	if !ok {
-		fmt.eprintln("Failed to read data from", DATA_FILE)
+		fmt.eprintln("Failed to load knapsack data")
 		return
 	}
-	items = data
+	items = knapsack_data

-	fmt.println("Running Genetic Algorithm for Binary Knapsack Problem")
-	fmt.printfln(
-		"Items: %d, Capacity: %d, Population: %d, Generations: %d\n",
-		NUMBER_OF_ITEMS,
-		CAPACITY,
-		POPULATION_SIZE,
-		GENERATIONS,
-	)
+	// Load feature selection dataset
+	feature_data, dataset_ok := load_dataset(DATASET_FILE)
+	if !dataset_ok {
+		fmt.eprintln("Failed to load dataset from:", DATASET_FILE)
+		return
+	}
+	dataset = feature_data

-	run_ga()
+	fmt.println("=== Baseline (All Features) ===")
+	baseline_rmse := run_baseline()
+	fmt.printfln("RMSE with all features: %.4f\n", baseline_rmse)
+
+	fmt.println("=== GA Feature Selection ===")
+	run_ga_feature_selection()
 }