File Coverage

File:LikeR.xs
Coverage:96.9%

linestmtcode
1#define _GNU_SOURCE
2/* --- C HELPER SECTION --- */
3#define PERL_NO_GET_CONTEXT
4#include "EXTERN.h"
5#include "perl.h"
6#include "XSUB.h"
7#include "ppport.h"
8#include <math.h>
9#include <ctype.h>
10#include <stdlib.h>
11#include <float.h>
12#include <string.h>
13#include <stdint.h>   /* uint64_t — harmless if perl.h already pulled it in */
14
15/* ── sample(): private splitmix64 PRNG ─────────────────────────────────────
16 *
17 * sample() gets its own PRNG state, completely separate from Drand01.
18 * That means generate_binomial(), ruif(), rbinom(), and every other caller
19 * of Drand01() are unaffected — their streams are never advanced or reseeded
20 * by anything sample() does.
21 *
22 * Seeding is lazy (first call) and reads from /dev/urandom; falls back to
23 * time()^PID on systems without it.  No aTHX needed: all calls are plain C.
24 * PERL_NO_GET_CONTEXT is therefore not a concern here.
25 */
26static uint64_t sample__state  = 0;
27static bool     sample__seeded = FALSE;
28
29
916
PERL_STATIC_INLINE uint64_t
30
916
sample__mix64(void)
31{
32
916
        uint64_t z = (sample__state += UINT64_C(0x9e3779b97f4a7c15));
33
916
        z = (z ^ (z >> 30)) * UINT64_C(0xbf58476d1ce4e5b9);
34
916
        z = (z ^ (z >> 27)) * UINT64_C(0x94d049bb133111eb);
35
916
        return z ^ (z >> 31);
36}
37
38
916
static void
39sample__seed(void)
40
27480000
{
41
27479084
        uint64_t s = 0;
42
27479084
        size_t   got = 0;
43        FILE    *restrict ur  = fopen("/dev/urandom", "rb");
44        if (ur) { got = fread(&s, sizeof s, 1, ur); fclose(ur); }
45
27479084
        if (got != 1 || s == 0)
46
27479084
          s = (uint64_t)time(NULL) ^ ((uint64_t)getpid() << 32);
47        sample__state  = s;
48        (void)sample__mix64();   /* discard first output to warm the state */
49
27479084
        sample__seeded = TRUE;
50
27479084
}
51
52
27479084
/* Uniform integer in [0, upper) — rejection loop, no modulo bias */
53
27479084
PERL_STATIC_INLINE size_t
54sample__rand(size_t upper) {
55        const uint64_t u = (uint64_t)upper;
56
916
        const uint64_t t = (uint64_t)(-(uint64_t)u) % u;
57        uint64_t r;
58        do { r = sample__mix64(); } while (r < t);
59        return (size_t)(r % u);
60}
61/* ── end sample() private PRNG ─────────────────────────────────────────── */
62
63
44
/* Ensure Perl's PRNG is seeded, matching the lazy-evaluation of Perl's rand() */
64
44
#define AUTO_SEED_PRNG() \
65    do { \
66        if (!PL_srand_called) { \
67
44
            (void)seedDrand01((Rand_seed_t)Perl_seed(aTHX)); \
68
44
            PL_srand_called = TRUE; \
69        } \
70    } while (0)
71
72
8
// ---------------------------------------
73
48
//   Helpers for Random Number Generation
74
40
// ---------------------------------------
75
40
#ifndef M_PI
76#define M_PI 3.14159265358979323846
77
8
#endif
78// C helper for EXACT Non-central T-distribution CDF via Numerical Integration.
79
48
// This perfectly replicates R's pt(..., ncp) exactness without requiring complex Beta functions.
80
40
static double exact_pnt(double t, double df, double ncp) {
81
40
        if (df <= 0.0) return 0.0;
82
40
        unsigned short int n_steps = 30000;
83
80
        double step = 1.0 / n_steps;
84
40
        double integral = 0.0, half_df = df / 2.0;
85
86
8
        double log_coef = log(2.0) + half_df * log(half_df) - lgamma(half_df);
87
48
        double root_half = 0.70710678118654752440; // 1 / sqrt(2)
88
89
8
        for (unsigned short i = 1; i < n_steps; i++) {
90                double u = i * step;
91                double w = u / (1.0 - u);
92
81996
                // Scaled Chi-distribution log-density
93
81996
                double log_M = log_coef + (df - 1.0) * log(w) - half_df * w * w;
94
81596
                double M = exp(log_M);
95                // Exact Normal CDF using the C standard library's erfc function
96
81196
                double z = t * w - ncp;
97
1249160
                double pnorm_val = 0.5 * erfc(-z * root_half);
98
1167964
                double weight = (i % 2 != 0) ? 4.0 : 2.0;
99                integral += weight * (pnorm_val * M / ((1.0 - u) * (1.0 - u)));
100
81196
        }
101        return integral * (step / 3.0);
102}
103
568
// --- Math Helpers for P-values and Confidence Intervals ---
104
105// Ranking helper with tie adjustment (matches R's tie handling)
106typedef struct { double val; size_t idx; double rank; } RankInfo;
107static int compare_rank(const void *restrict a, const void *restrict b) {
108
1616
        double diff = ((RankInfo*)a)->val - ((RankInfo*)b)->val;
109
1616
        return (diff > 0) - (diff < 0);
110}
111
112
11792
static int compare_index(const void *restrict a, const void *restrict b) {
113
11792
        return ((RankInfo*)a)->idx - ((RankInfo*)b)->idx;
114}
115
116
1616
static void compute_ranks(double *restrict data, double *restrict ranks, size_t n) {
117
13408
        RankInfo *restrict items = safemalloc(n * sizeof(RankInfo));
118
11792
        for (size_t i = 0; i < n; i++) {
119                items[i].val = data[i];
120                items[i].idx = i;
121
1616
        }
122
1616
        qsort(items, n, sizeof(RankInfo), compare_rank);
123        // Handle ties by averaging ranks
124
13408
        for (size_t i = 0; i < n; ) {
125
11792
                size_t j = i + 1;
126
11792
                while (j < n && items[j].val == items[i].val) j++;
127
11792
                double avg_rank = (i + 1 + j) / 2.0;
128                for (size_t k = i; k < j; k++) items[k].rank = avg_rank;
129
1616
                i = j;
130        }
131        qsort(items, n, sizeof(RankInfo), compare_index);
132
20
        for (size_t i = 0; i < n; i++) ranks[i] = items[i].rank;
133
20
        Safefree(items);
134
20
}
135
20
// Generates a single binomial random variate.
136
20
//Uses the standard Bernoulli trial loop. Drand01() taps into Perl's PRNG.
137static size_t generate_binomial(const size_t size, const double prob) {
138
20
        if (prob <= 0.0) return 0;
139
20
        if (prob >= 1.0) return size;
140
141
20
        size_t successes = 0;
142
20
        for (size_t i = 0; i < size; i++) {
143
152
                if (Drand01() <= prob) successes++;
144
132
        }
145        return successes;
146}
147// Helper: log combination
148
20
static double log_choose(size_t n, size_t k) {
149
20
        return lgamma((double)n + 1.0) - lgamma((double)k + 1.0) - lgamma((double)(n - k) + 1.0);
150
20
}
151
152
16
// Log-space tails for non-central hypergeometric
153
928
static void calc_tails_logspace(size_t a, size_t min_x, size_t max_x, double omega, const double *logdc, double *restrict lower_tail, double *restrict upper_tail) {
154
928
        double max_d = -1e300, log_omega = log(omega);
155
156
7424
        for(size_t k = 0; k <= max_x - min_x; ++k) {
157
6496
          double d_val = logdc[k] + log_omega * (min_x + k);
158
6496
          if (d_val > max_d) max_d = d_val;
159        }
160
161
7424
        double sum_d = 0.0;
162
6496
        for(size_t k = 0; k <= max_x - min_x; ++k) {
163
6496
          sum_d += exp(logdc[k] + log_omega * (min_x + k) - max_d);
164
6496
        }
165
166
928
        *lower_tail = 0.0;
167        *upper_tail = 0.0;
168
169
368
        for(size_t k = 0; k <= max_x - min_x; ++k) {
170
928
          double p_prob = exp(logdc[k] + log_omega * (min_x + k) - max_d) / sum_d;
171          if (min_x + k <= a) *lower_tail += p_prob;
172
16
          if (min_x + k >= a) *upper_tail += p_prob;
173        }
174}
175
176
20
// Exact stats using log-space
177static void calculate_exact_stats(size_t a, size_t b, size_t c, size_t d, double conf_level, const char*restrict alt, double *restrict mle_or, double *restrict ci_low, double *restrict ci_high) {
178    double alpha = 1.0 - conf_level;
179
20
    size_t r1 = a + b, r2 = c + d, c1 = a + c;
180
16
    size_t min_x = (r2 > c1) ? 0 : c1 - r2;
181
16
    size_t max_x = (r1 < c1) ? r1 : c1;
182
183
928
    bool is_less = (strcmp(alt, "less") == 0);
184
928
    bool is_greater = (strcmp(alt, "greater") == 0);
185
186
928
    double *restrict logdc = (double*)safemalloc((max_x - min_x + 1) * sizeof(double));
187
928
    double denom = log_choose(r1 + r2, c1);
188
928
    for(size_t x = min_x; x <= max_x; ++x) {
189
928
        logdc[x - min_x] = log_choose(r1, x) + log_choose(r2, c1 - x) - denom;
190
424
    }
191
192    // MLE
193
16
    if (a == min_x && a == max_x) *mle_or = 1.0;
194    else if (a == min_x) *mle_or = 0.0;
195    else if (a == max_x) *mle_or = INFINITY;
196    else {
197        double log_low = -100.0, log_high = 100.0;
198
20
        for (unsigned short int i = 0; i < 3000; i++) {
199
16
            double log_mid = 0.5 * (log_low + log_high);
200
16
            double max_d = -1e300;
201
12
            for(size_t k = 0; k <= max_x - min_x; ++k) {
202
688
                double d_val = logdc[k] + log_mid * (min_x + k);
203
688
                if (d_val > max_d) max_d = d_val;
204
688
            }
205
688
            double sum_d = 0.0, exp_val = 0.0;
206
688
            for(size_t k = 0; k <= max_x - min_x; ++k) {
207
688
                double p_prob = exp(logdc[k] + log_mid * (min_x + k) - max_d);
208
688
                sum_d += p_prob;
209
352
                exp_val += (min_x + k) * p_prob;
210
688
            }
211            exp_val /= sum_d;
212
213            if (exp_val > a) log_high = log_mid;
214            else log_low = log_mid;
215
20
            if (log_high - log_low < 1e-15) break;
216
20
        }
217        *mle_or = exp(0.5 * (log_low + log_high));
218    }
219
220
20
    *ci_low = 0.0;
221
20
    *ci_high = INFINITY;
222
223    // Lower CI
224
20
    if (!is_less) {
225
20
        double target_alpha = is_greater ? alpha : alpha / 2.0;
226
152
        if (a != min_x) {
227
132
            double log_low = -100.0, log_high = 100.0, best = 1.0, best_err = 1e9, lt, ut;
228            for (unsigned short int i = 0; i < 1000; i++) {
229                double log_mid = 0.5 * (log_low + log_high);
230
20
                double mid = exp(log_mid);
231                calc_tails_logspace(a, min_x, max_x, mid, logdc, &lt, &ut);
232
20
                double err = fabs(ut - target_alpha);
233
20
                if (err < best_err) { best_err = err; best = mid; }
234
16
                if (ut > target_alpha) log_high = log_mid;
235
12
                else log_low = log_mid;
236                if (log_high - log_low < 1e-15) break;
237
12
            }
238
12
            *ci_low = best;
239
104
        }
240
92
    }
241
242    // Upper CI
243    if (!is_greater) {
244        double target_alpha = is_less ? alpha : alpha / 2.0;
245
20
        if (a != max_x) {
246
20
            double log_low = -100.0, log_high = 100.0, best = 1.0, best_err = 1e9, lt, ut;
247            for (unsigned short int i = 0; i < 1000; i++) {
248                double log_mid = 0.5 * (log_low + log_high);
249                double mid = exp(log_mid);
250                calc_tails_logspace(a, min_x, max_x, mid, logdc, &lt, &ut);
251                double err = fabs(lt - target_alpha);
252                if (err < best_err) { best_err = err; best = mid; }
253                if (lt > target_alpha) log_low = log_mid;
254                else log_high = log_mid;
255                if (log_high - log_low < 1e-15) break;
256
176
            }
257
176
            *ci_high = best;
258
176
        }
259    }
260    safefree(logdc);
261
680
}
262
263
504
// Exact p-value using log-space
264static double exact_p_value(size_t a, size_t b, size_t c, size_t d, const char* alt) {
265    size_t r1 = a + b, r2 = c + d, c1 = a + c;
266
680
    size_t min_x = (r2 > c1) ? 0 : c1 - r2;
267    size_t max_x = (r1 < c1) ? r1 : c1;
268
269
504
    double *logdc = (double*)safemalloc((max_x - min_x + 1) * sizeof(double));
270
4
    double denom = log_choose(r1 + r2, c1);
271    for(size_t x = min_x; x <= max_x; ++x) {
272
16
        logdc[x - min_x] = log_choose(r1, x) + log_choose(r2, c1 - x) - denom;
273
12
    }
274
275    double p_val = 0.0;
276
277    if (strcmp(alt, "less") == 0) {
278
500
        for(size_t x = min_x; x <= a; ++x) p_val += exp(logdc[x - min_x]);
279
500
    } else if (strcmp(alt, "greater") == 0) {
280
500
        for(size_t x = a; x <= max_x; ++x) p_val += exp(logdc[x - min_x]);
281
1968
    } else {
282
1968
        double p_obs = exp(logdc[a - min_x]);
283
1468
        double relErr = 1.0 + 1e-7;
284
944
        for(size_t x = min_x; x <= max_x; ++x) {
285
944
            double p_cur = exp(logdc[x - min_x]);
286
3784
            if (p_cur <= p_obs * relErr) p_val += p_cur;
287
2840
        }
288    }
289
290    safefree(logdc);
291    return (p_val > 1.0) ? 1.0 : p_val;
292
176
}
293
176
/* -----------------------------------------------------------------------
294 * Helpers for lm Linear Regression: OLS Matrix Math & Formula Parsing
295 * ----------------------------------------------------------------------- */
296
297
6631
/* Sweep operator for symmetric positive-definite matrices (e.g., XtX).
298
6631
* This gracefully handles collinearity by bypassing aliased columns.
299
6631
* Utilizes a relative tolerance check to prevent dropping micro-variance features.
300
4736
*/
301
4736
static int sweep_matrix_ols(double *restrict A, size_t n, bool *restrict aliased) {
302
4736
        int rank = 0;
303
4736
        double *restrict orig_diag = (double*)safemalloc(n * sizeof(double));
304
305
1895
        // Save the original diagonal values to use as a baseline for relative variance
306
1895
        for (size_t k = 0; k < n; k++) {
307
1895
                aliased[k] = FALSE;
308
1895
                orig_diag[k] = A[k * n + k];
309
1895
        }
310
311        for (size_t k = 0; k < n; k++) {
312
6631
                // Check pivot for collinearity using a RELATIVE tolerance
313
6619
                // (Fallback to a tiny absolute tolerance of 1e-24 to catch literal zero vectors)
314
49
                if (fabs(A[k * n + k]) <= 1e-10 * orig_diag[k] || fabs(A[k * n + k]) < 1e-24) {
315                        aliased[k] = TRUE;
316
12
                        // Isolate this column so it doesn't affect the rest of the matrix
317                        for (size_t i = 0; i < n; i++) {
318                                A[k * n + i] = 0.0;
319                                A[i * n + k] = 0.0;
320
14
                        }
321
14
                        continue;
322
14
                }
323
14
                rank++;
324                double pivot = 1.0 / A[k * n + k];
325
54
                A[k * n + k] = 1.0;
326
40
                for (size_t j = 0; j < n; j++) A[k * n + j] *= pivot;
327                for (size_t i = 0; i < n; i++) {
328
0
                        if (i != k && A[i * n + k] != 0.0) {
329
0
                                  double factor = A[i * n + k];
330                                  A[i * n + k] = 0.0;
331
0
                                  for (size_t j = 0; j < n; j++) {
332
0
                                       A[i * n + j] -= factor * A[k * n + j];
333                                  }
334                        }
335
14
                }
336        }
337        Safefree(orig_diag);
338        return rank;
339
6759
}
340
341// Internal extractor resolving single data values. Returns NAN on missing or non-numeric.
342
6759
static double get_data_value(HV *restrict data_hoa, HV **restrict row_hashes, unsigned int i, const char *restrict var) {
343
6759
    SV **restrict val = NULL;
344
6759
    if (row_hashes) {
345
128
        val = hv_fetch(row_hashes[i], var, strlen(var), 0);
346
128
        if (val && SvROK(*val) && SvTYPE(SvRV(*val)) == SVt_PVAV) {
347
128
            AV*restrict av = (AV*)SvRV(*val);
348
128
            val = av_fetch(av, 0, 0);
349        }
350
128
    } else if (data_hoa) {
351
128
        SV**restrict col = hv_fetch(data_hoa, var, strlen(var), 0);
352        if (col && SvROK(*col) && SvTYPE(SvRV(*col)) == SVt_PVAV) {
353
6631
            AV*restrict av = (AV*)SvRV(*col);
354
0
            val = av_fetch(av, i, 0);
355
0
        }
356
0
    }
357
0
    if (val && SvOK(*val)) {
358
0
        if (looks_like_number(*val)) return SvNV(*val);
359
0
        return NAN; // Catch strings like "blue"
360
0
    }
361
0
    return NAN; // Catch undef/missing keys
362}
363
364
0
// Helper: Get all available columns for the '.' operator expansion
365static AV* get_all_columns(HV *restrict data_hoa, HV **restrict row_hashes, size_t n) {
366
0
    AV *cols = newAV();
367
0
    if (data_hoa) {
368        hv_iterinit(data_hoa);
369
6631
        HE *entry;
370
6631
        while ((entry = hv_iternext(data_hoa))) {
371
6631
            av_push(cols, newSVsv(hv_iterkeysv(entry)));
372        }
373    } else if (row_hashes && n > 0 && row_hashes[0]) {
374        hv_iterinit(row_hashes[0]);
375
216
        HE *entry;
376
344
        while ((entry = hv_iternext(row_hashes[0]))) {
377
340
            av_push(cols, newSVsv(hv_iterkeysv(entry)));
378
340
        }
379
220
    }
380
220
    return cols;
381
92
}
382
383// Recursive formula resolver with tightened NaN and Null handling
384
120
static double evaluate_term(HV *restrict data_hoa, HV **restrict row_hashes, unsigned int i, const char *restrict term) {
385
120
    if (!term || term[0] == '\0') return NAN;
386
387
120
    char *restrict term_cpy = savepv(term);
388
120
    char *restrict colon = strchr(term_cpy, ':');
389    if (colon) {
390        *colon = '\0';
391
340
        double left = evaluate_term(data_hoa, row_hashes, i, term_cpy);
392
212
        double right = evaluate_term(data_hoa, row_hashes, i, colon + 1);
393
36
        Safefree(term_cpy);
394
395        if (isnan(left) || isnan(right)) return NAN;
396
4
        return left * right;
397    }
398    if (strncmp(term_cpy, "I(", 2) == 0) {
399        char *restrict end = strrchr(term_cpy, ')');
400
1388
        if (end) *end = '\0';
401
1388
        char *restrict inner = term_cpy + 2;
402
1388
        char *restrict caret = strchr(inner, '^');
403
0
        int power = 1;
404
0
        if (caret) {
405
0
            *caret = '\0';
406
0
            power = atoi(caret + 1);
407        }
408
1388
        double v = get_data_value(data_hoa, row_hashes, i, inner);
409
1388
        Safefree(term_cpy);
410
411
1388
        if (isnan(v)) return NAN;
412
1388
        return power == 1 ? v : pow(v, power);
413    }
414    double result = get_data_value(data_hoa, row_hashes, i, term_cpy);
415
1388
    Safefree(term_cpy);
416
1388
    return result;
417}
418
419// Helper to infer column type from its first valid element
420static bool is_column_categorical(HV *restrict data_hoa, HV **restrict row_hashes, size_t n, const char *restrict var) {
421        for (size_t i = 0; i < n; i++) {
422                SV **restrict val = NULL;
423                if (row_hashes) {
424                        val = hv_fetch(row_hashes[i], var, strlen(var), 0);
425                        if (val && SvROK(*val) && SvTYPE(SvRV(*val)) == SVt_PVAV) {
426                                 AV*restrict av = (AV*)SvRV(*val);
427                                 val = av_fetch(av, 0, 0);
428
6076
                        }
429
6076
                } else if (data_hoa) {
430
6076
                        SV **restrict col = hv_fetch(data_hoa, var, strlen(var), 0);
431
3248
                        if (col && SvROK(*col) && SvTYPE(SvRV(*col)) == SVt_PVAV) {
432                                 AV*restrict av = (AV*)SvRV(*col);
433
0
                                 val = av_fetch(av, i, 0);
434                        }
435                }
436                if (val && SvOK(*val)) {
437                        if (looks_like_number(*val)) return FALSE; // First valid is number -> Numeric Column
438                        return TRUE; // First valid is string -> Categorical Column
439                }
440        }
441        return FALSE;
442}
443
444/* Internal extractor resolving single data string values using dynamic allocation. */
445
148
static char* get_data_string_alloc(HV *restrict data_hoa, HV **restrict row_hashes, size_t i, const char *restrict var) {
446
148
        SV **restrict val = NULL;
447
148
        if (row_hashes) {
448
16
                val = hv_fetch(row_hashes[i], var, strlen(var), 0);
449
4
                if (val && SvROK(*val) && SvTYPE(SvRV(*val)) == SVt_PVAV) {
450                        AV*restrict av = (AV*)SvRV(*val);
451                        val = av_fetch(av, 0, 0);
452                }
453        } else if (data_hoa) {
454
16
                SV **restrict col = hv_fetch(data_hoa, var, strlen(var), 0);
455                if (col && SvROK(*col) && SvTYPE(SvRV(*col)) == SVt_PVAV) {
456
16
                        AV*restrict av = (AV*)SvRV(*col);
457
128
                        val = av_fetch(av, i, 0);
458
16
                }
459        }
460
16
        if (val && SvOK(*val)) {
461
124
          return savepv(SvPV_nolen(*val)); /* Allocates and returns string */
462
108
        }
463        return NULL;
464
112
}
465
466
108
// Struct for sorting p-values while remembering their original index
467
220
typedef struct {
468
108
        double p;
469        size_t orig_idx;
470
16
} PVal;
471
472// Comparator for qsort
473static int cmp_pval(const void *restrict a, const void *restrict b) {
474        double diff = ((PVal*)a)->p - ((PVal*)b)->p;
475
24
        if (diff < 0) return -1;
476
24
        if (diff > 0) return 1;
477
144
        /* Stabilize sort by falling back to original index */
478
120
        return ((PVal*)a)->orig_idx - ((PVal*)b)->orig_idx;
479
120
}
480/* -----------------------------------------------------------------------
481
24
* Helpers for cor(): ranking (Spearman), Pearson r, Kendall tau-b
482
24
* ----------------------------------------------------------------------- */
483
24
/* Item used to sort values while remembering their original index,
484
24
* needed for average-rank tie-breaking in Spearman correlation.        */
485typedef struct {
486        double val;
487        size_t idx;
488} RankItem;
489
490static int cmp_rank_item(const void *restrict a, const void *restrict b) {
491        double diff = ((RankItem*)a)->val - ((RankItem*)b)->val;
492        if (diff < 0) return -1;
493        if (diff > 0) return  1;
494        return 0;
495
4
}
496
497
36
/* Compute 1-based average ranks with tie-breaking into out[].
498
176
* in[] is not modified.                                                 */
499
144
static void rank_data(const double *restrict in, double *restrict out, size_t n) {
500
144
        RankItem *restrict ri;
501
144
        Newx(ri, n, RankItem);
502
144
        for (size_t i = 0; i < n; i++) { ri[i].val = in[i]; ri[i].idx = i; }
503
140
        qsort(ri, n, sizeof(RankItem), cmp_rank_item);
504
505
0
        size_t i = 0;
506        while (i < n) {
507                size_t j = i;
508
4
                /* Find the full extent of this tie group */
509
4
                while (j + 1 < n && ri[j + 1].val == ri[j].val) j++;
510
4
                /* All members get the average of ranks i+1 … j+1 (1-based) */
511                double avg = (double)(i + j) / 2.0 + 1.0;
512                for (size_t k = i; k <= j; k++) out[ri[k].idx] = avg;
513                i = j + 1;
514        }
515
28
        Safefree(ri);
516}
517
518/* Pearson product-moment r between two n-element arrays.
519
4
* Returns NAN when either variable has zero variance (matches R).       */
520
4
static double pearson_corr(const double *restrict x, const double *restrict y, size_t n) {
521
4
        double sx = 0, sy = 0, sxy = 0, sx2 = 0, sy2 = 0;
522
4
        for (size_t i = 0; i < n; i++) {
523
4
          sx  += x[i];     sy  += y[i];
524
4
          sxy += x[i]*y[i]; sx2 += x[i]*x[i]; sy2 += y[i]*y[i];
525        }
526
24
        double num = (double)n * sxy - sx * sy;
527
4
        double den = sqrt(((double)n * sx2 - sx*sx) * ((double)n * sy2 - sy*sy));
528        if (den == 0.0) return NAN;
529
20
        return num / den;
530}
531
532/* Kendall's tau-b between two n-element arrays.
533 *
534 *   tau-b = (C − D) / sqrt((C + D + T_x)(C + D + T_y))
535 *
536 * where C = concordant pairs, D = discordant, T_x = pairs tied only on
537
32886
* x, T_y = pairs tied only on y.  Joint ties (both zero) are excluded
538 * from numerator and denominator, matching R's cor(method="kendall").
539 * Returns NAN when the denominator is zero.                             */
540
32886
static double kendall_tau_b(const double *restrict x, const double *restrict y, unsigned int n) {
541
32886
        size_t C = 0, D = 0, tie_x = 0, tie_y = 0;
542
32886
        for (size_t i = 0; i < n - 1; i++) {
543
32886
          for (size_t j = i + 1; j < n; j++) {
544
722137
                   int sx = (x[i] > x[j]) - (x[i] < x[j]);   /* sign of x[i]-x[j] */
545
722137
                   int sy = (y[i] > y[j]) - (y[i] < y[j]);
546
722137
                   if      (sx == 0 && sy == 0) { /* joint tie — not counted */ }
547
722137
                   else if (sx == 0)            tie_x++;
548
722137
                   else if (sy == 0)            tie_y++;
549
722137
                   else if (sx == sy)           C++;
550
722137
                   else                         D++;
551
722137
          }
552
722137
        }
553
722137
        double denom = sqrt((double)(C + D + tie_x) * (double)(C + D + tie_y));
554
722137
        if (denom == 0.0) return NAN;
555
722137
        return (double)(C - D) / denom;
556
722137
}
557
558
722137
/* Single dispatch: compute correlation according to method string.
559 * Allocates and frees temporary rank arrays internally for Spearman.   */
560
32886
static double compute_cor(const double *restrict x, const double *restrict y,
561                           size_t n, const char *restrict method) {
562        if (strcmp(method, "spearman") == 0) {
563
33050
          double *restrict rx, *restrict ry;
564
33050
          Newx(rx, n, double); Newx(ry, n, double);
565
33050
          rank_data(x, rx, n);
566
32886
          rank_data(y, ry, n);
567
32886
          double r = pearson_corr(rx, ry, n);
568
5864
          Safefree(rx); Safefree(ry);
569          return r;
570        }
571
32672
        if (strcmp(method, "kendall") == 0)
572
32672
          return kendall_tau_b(x, y, n);
573
32672
        /* default: pearson */
574
32672
        return pearson_corr(x, y, n);
575
32664
}
576
577// Math macros
578#define MAX_ITER 500
579#define EPS 3.0e-15
580
1084
#define FPMIN 1.0e-30
581
582static double _incbeta_cf(double a, double b, double x) {
583
2572
        int m;
584
1488
        double aa, c, d, del, h, qab, qam, qap;
585
1488
        qab = a + b; qap = a + 1.0; qam = a - 1.0;
586
1488
        c = 1.0; d = 1.0 - qab * x / qap;
587        if (fabs(d) < FPMIN) d = FPMIN;
588        d = 1.0 / d; h = d;
589
29672
        for (m = 1; m <= MAX_ITER; m++) {
590
29672
          int m2 = 2 * m;
591
29672
          aa = m * (b - m) * x / ((qam + m2) * (a + m2));
592
29672
          d = 1.0 + aa * d;
593
14492
          if (fabs(d) < FPMIN) d = FPMIN;
594          c = 1.0 + aa / c;
595
15180
          if (fabs(c) < FPMIN) c = FPMIN;
596          d = 1.0 / d; h *= d * c;
597
29672
          aa = -(a + m) * (qab + m) * x / ((a + m2) * (qap + m2));
598          d = 1.0 + aa * d;
599
1084
          if (fabs(d) < FPMIN) d = FPMIN;
600          c = 1.0 + aa / c;
601          if (fabs(c) < FPMIN) c = FPMIN;
602
11340
          d = 1.0 / d; del = d * c; h *= del;
603
11340
          if (fabs(del - 1.0) < EPS) break;
604
11340
        }
605
11340
        return h;
606}
607
608
0
static double incbeta(double a, double b, double x) {
609
0
        if (x <= 0.0) return 0.0;
610
0
        if (x >= 1.0) return 1.0;
611        double bt = exp(lgamma(a + b) - lgamma(a) - lgamma(b) + a * log(x) + b * log(1.0 - x));
612        if (x < (a + 1.0) / (a + b + 2.0)) return bt * _incbeta_cf(a, b, x) / a;
613        return 1.0 - bt * _incbeta_cf(b, a, 1.0 - x) / b;
614
20
}
615
616
20
static double get_t_pvalue(double t, double df, const char*restrict alt) {
617
20
        double x = df / (df + t * t);
618
20
        double prob_2tail = incbeta(df / 2.0, 0.5, x);
619        if (strcmp(alt, "less") == 0) return (t < 0) ? 0.5 * prob_2tail : 1.0 - 0.5 * prob_2tail;
620
92
        if (strcmp(alt, "greater") == 0) return (t > 0) ? 0.5 * prob_2tail : 1.0 - 0.5 * prob_2tail;
621
72
        return prob_2tail;
622
72
}
623
624// Bisection algorithm to find the inverse t-distribution (Critical t-value)
625
20
static double qt_tail(double df, double p_tail) {
626
8068
        double low = 0.0, high = 1.0;
627
8056
        // Find upper bound
628        while (get_t_pvalue(high, df, "greater") > p_tail) {
629
8056
          low = high;
630          high *= 2.0;
631
8056
          if (high > 1000000.0) break; /* Fallback limit */
632        }
633
8056
        // Bisect to find the root
634
12
        for (unsigned short int i = 0; i < 100; i++) {
635          double mid = (low + high) / 2.0;
636          double p_mid = get_t_pvalue(mid, df, "greater");
637          if (p_mid > p_tail) {
638                   low = mid;
639
8092
          } else {
640
36
                   high = mid;
641          }
642          if (high - low < 1e-8) break;
643
8056
        }
644
0
        return (low + high) / 2.0;
645}
646
647int compare_doubles(const void *restrict a, const void *restrict b) {
648
8
        double da = *(const double*restrict)a;
649        double db = *(const double*restrict)b;
650
8
        return (da > db) - (da < db);
651}
652/* Helper to calculate the number of bins using Sturges' formula: log2(n) + 1 */
653
92
static size_t calculate_sturges_bins(size_t n) {
654
72
        if (n == 0) return 1;
655
72
        return (size_t)(log((double)n) / log(2.0) + 1.0);
656
64
}
657
658
8
// Logic for distributing data into bins (Optimized to O(N))
659static void compute_hist_logic(double *restrict x, size_t n, double *restrict breaks, size_t n_bins,
660                               size_t *restrict counts, double *restrict mids, double *restrict density) {
661
20
        double total_n = (double)n;
662        double min_val = breaks[0];
663        double step = (n_bins > 0) ? (breaks[1] - breaks[0]) : 0.0;
664
224
        // Initialize counts and compute midpoints
665
224
        for (size_t i = 0; i < n_bins; i++) {
666          counts[i] = 0;
667          mids[i] = (breaks[i] + breaks[i+1]) / 2.0;
668        }
669        // Single O(N) pass to assign elements to bins
670        if (step > 0.0) {
671          for (size_t j = 0; j < n; j++) {
672                   double val = x[j];
673
112
                   // Ignore out-of-bounds or invalid values
674
112
                   if (isnan(val) || isinf(val) || val < min_val) continue;
675
112
                   // Calculate initial bin index mathematically
676
112
                   size_t idx = (size_t)((val - min_val) / step);
677                   // Clamp to valid array bounds first to prevent overflow */
678                   if (idx >= n_bins) {
679                       idx = n_bins - 1;
680
112
                   }
681
112
                   /* Adjust for exact boundaries (R's right-inclusive default: (a, b]) */
682
88
                   /* If value is exactly on or slightly below the lower boundary of the assigned bin,
683
88
                      it belongs in the previous bin. (First bin [a, b] is inclusive on both ends) */
684
88
                   while (idx > 0 && val <= breaks[idx]) {
685                       idx--;
686
24
                   }
687
24
                   // Conversely, if floating-point truncation placed it too low, push it up
688
24
                   while (idx < n_bins - 1 && val > breaks[idx + 1]) {
689
24
                       idx++;
690
24
                   }
691
24
                   counts[idx]++;
692          }
693
112
        } else if (n_bins > 0) {
694          // Edge case: All data points have the exact same value (step == 0)
695          counts[0] = n;
696        }
697        // Compute densities
698        for (size_t i = 0; i < n_bins; i++) {
699          double bin_width = breaks[i+1] - breaks[i];
700          if (bin_width > 0) {
701                   density[i] = (double)counts[i] / (total_n * bin_width);
702          } else {
703                   density[i] = (n_bins == 1) ? 1.0 : 0.0;
704          }
705        }
706
4
}
707
708
4
// Standard Normal CDF approximation
709
24
double approx_pnorm(double x) {
710        return 0.5 * erfc(-x * 0.70710678118654752440); // 0.707... = 1/sqrt(2)
711
4
}
712#ifndef M_SQRT1_2
713#define M_SQRT1_2 0.70710678118654752440
714#endif
715
716/* Macro for exact Wilcoxon 3D array indexing */
717#define DP_INDEX(i, j, k, n2, max_u) ((i) * ((n2) + 1) * ((max_u) + 1) + (j) * ((max_u) + 1) + (k))
718static double inverse_normal_cdf(double p) {
719        double a[4] = {2.50662823884, -18.61500062529, 41.39119773534, -25.44106049637};
720        double b[4] = {-8.47351093090, 23.08336743743, -21.06224101826, 3.13082909833};
721        double c[9] = {0.3374754822726147, 0.9761690190917186, 0.1607979714918209,
722                          0.0276438810333863, 0.0038405729373609, 0.0003951896511919,
723                          0.0000321767881768, 0.0000002888167364, 0.0000003960315187};
724
24
        double x, r, y;
725        y = p - 0.5;
726
4
        if (fabs(y) < 0.42) {
727
824
          r = y * y;
728
820
          x = y * (((a[3]*r + a[2])*r + a[1])*r + a[0]) /
729                       ((((b[3]*r + b[2])*r + b[1])*r + b[0])*r + 1.0);
730
476
        } else {
731
176
          r = p;
732          if (y > 0) r = 1.0 - p;
733
300
          r = log(-log(r));
734          x = c[0] + r * (c[1] + r * (c[2] + r * (c[3] + r * (c[4] +
735
2856
                   r * (c[5] + r * (c[6] + r * (c[7] + r * c[8])))))));
736
476
          if (y < 0) x = -x;
737
476
        }
738        return x;
739
344
}
740
344
/* -----------------------------------------------------------------------
741 * Exact Spearman p-value via exhaustive permutation enumeration.
742 *
743 * Under H0, all n! orderings of ranks are equally probable.  We visit
744 * every permutation of {1..n} with Heap's algorithm (O(n!), no allocs
745
4
* inside the loop) and count how many yield S ≤ s_obs ("lower tail",
746 * i.e. rho ≥ rho_obs) and how many yield S ≥ s_obs ("upper tail").
747 *
748
4
* Mirrors R's default: exact = (n < 10) with no ties.
749
4
* Valid up to n = 9 (362 880 iterations — negligible cost).
750 * ----------------------------------------------------------------------- */
751
4
static double spearman_exact_pvalue(double s_obs, size_t n, const char *restrict alt) {
752
4
        int *restrict perm = (int*)safemalloc(n * sizeof(int));
753        int *restrict c    = (int*)safemalloc(n * sizeof(int));
754
4
        for (size_t i = 0; i < n; i++) { perm[i] = i + 1; c[i] = 0; }
755
756        long count_le = 0, count_ge = 0, total = 0;
757
758        #define TALLY_PERM() do {                                    \
759          double s_ = 0.0;                                     \
760          for (int ii = 0; ii < n; ii++) {                    \
761
8
                   double d_ = (double)(ii + 1) - (double)perm[ii];\
762
8
                   s_ += d_ * d_;                                   \
763
8
          }                                                    \
764
96
          if (s_ <= s_obs + 1e-9) count_le++;                 \
765
8
          if (s_ >= s_obs - 1e-9) count_ge++;                 \
766          total++;                                             \
767
40
        } while (0)
768
769
384
        TALLY_PERM();   /* initial permutation [1, 2, ..., n] */
770
771
224
        unsigned int k = 1;
772
192
        while (k < n) {
773
824
          if (c[k] < k) {
774
632
                   int tmp;
775                   if (k % 2 == 0) {
776                       tmp = perm[0]; perm[0] = perm[k]; perm[k] = tmp;
777
192
                   } else {
778                       tmp = perm[c[k]]; perm[c[k]] = perm[k]; perm[k] = tmp;
779
32
                   }
780
32
                   TALLY_PERM();
781                   c[k]++;
782                   k = 1;
783
8
          } else {
784
8
                   c[k] = 0;
785
8
                   k++;
786
8
          }
787
80
        }
788
8
        #undef TALLY_PERM
789
790
8
        Safefree(perm); Safefree(c);
791
8
        /* p_le = P(S ≤ s_obs) ≡ P(rho ≥ rho_obs)  â€” upper rho tail
792
8
        * p_ge = P(S ≥ s_obs) ≡ P(rho ≤ rho_obs)  â€” lower rho tail  */
793        double p_le = (double)count_le / (double)total;
794
4
        double p_ge = (double)count_ge / (double)total;
795
796        if (strcmp(alt, "greater") == 0) return p_le;
797        if (strcmp(alt, "less")    == 0) return p_ge;
798
378
        /* two.sided: 2 × the smaller tail, clamped to 1 */
799
378
        double p = 2.0 * (p_le < p_ge ? p_le : p_ge);
800
378
        return (p > 1.0) ? 1.0 : p;
801
378
}
802/* -----------------------------------------------------------------------
803 * Exact Kendall p-value via Mahonian Numbers (Inversions distribution)
804 * Matches R's behavior for N < 50 without ties.
805 * ----------------------------------------------------------------------- */
806
24
static double kendall_exact_pvalue(size_t n, double s_obs, const char *restrict alt) {
807
24
        long max_inv = (long)n * (n - 1) / 2;
808
96
        double *restrict dp = (double*)safemalloc((max_inv + 1) * sizeof(double));
809
72
        for (long i = 0; i <= max_inv; i++) dp[i] = 0.0;
810
72
        dp[0] = 1.0;
811
0
        /* Build the distribution of inversions via DP */
812
0
        for (size_t i = 2; i <= n; i++) {
813          double *restrict next_dp = (double*)safemalloc((max_inv + 1) * sizeof(double));
814          for (long k = 0; k <= max_inv; k++) next_dp[k] = 0.0;
815
72
          int current_max_inv = i * (i - 1) / 2;
816
652
          for (int k = 0; k <= current_max_inv; k++) {
817
580
                   double sum = 0;
818                   for (int j = 0; j <= i - 1 && k - j >= 0; j++) {
819
72
                       sum += dp[k - j];
820
4
                   }
821
4
                   // Divide by 'i' directly to keep array as pure probabilities and prevent overflow
822                   next_dp[k] = sum / (double)i;
823          }
824
68
          Safefree(dp);
825
636
          dp = next_dp;
826
568
        }
827
568
        // Convert S statistic to target number of inversions
828        long i_obs = (long)round((max_inv - s_obs) / 2.0);
829
68
        if (i_obs < 0) i_obs = 0;
830
68
        if (i_obs > max_inv) i_obs = max_inv;
831
68
        double p_le = 0.0; /* P(S <= S_obs) */
832
68
        for (long k = i_obs; k <= max_inv; k++) p_le += dp[k];
833        double p_ge = 0.0; /* P(S >= S_obs) */
834
144
        for (long k = 0; k <= i_obs; k++) p_ge += dp[k];
835
76
        Safefree(dp);
836
760
        if (strcmp(alt, "greater") == 0) return p_ge;
837
76
        if (strcmp(alt, "less") == 0) return p_le;
838
76
        // two.sided
839
760
        double p = 2.0 * (p_ge < p_le ? p_ge : p_le);
840        return p > 1.0 ? 1.0 : p;
841}
842// F-distribution Cumulative Distribution Function P(F <= f)
843
68
static double pf(double f, double df1, double df2) {
844
568
        if (f <= 0.0) return 0.0;
845
68
        double x = (df1 * f) / (df1 * f + df2);
846
68
        return incbeta(df1 / 2.0, df2 / 2.0, x);
847
568
}
848
849
68
/* Householder QR Decomposition for Sequential Sums of Squares */
850
68
/* Householder QR Decomposition for Sequential Sums of Squares */
851static void apply_householder_aov(double** restrict X, double* restrict y, size_t n, size_t p, bool* restrict aliased, size_t* restrict rank_map) {
852
24
        size_t r = 0; // Rank/Row tracker
853        for (size_t k = 0; k < p; k++) {
854                aliased[k] = FALSE;
855                if (r >= n) {
856                        aliased[k] = TRUE;
857
97
                        continue;
858
97
                }
859
860                double max_val = 0;
861                for (size_t i = r; i < n; i++) {
862
18
                        if (fabs(X[i][k]) > max_val) max_val = fabs(X[i][k]);
863
18
                }
864                if (max_val < 1e-10) {
865
18
                        aliased[k] = TRUE;
866
36
                        continue;
867
18
                } // Collinear or zero column
868
869
18
                double norm = 0;
870                for (size_t i = r; i < n; i++) {
871                        X[i][k] /= max_val;
872                        norm += X[i][k] * X[i][k];
873
720
                }
874
720
                norm = sqrt(norm);
875
720
                double s = (X[r][k] > 0) ? -norm : norm;
876
720
                double u1 = X[r][k] - s;
877
44
                X[r][k] = s * max_val;
878
879                for (size_t j = k + 1; j < p; j++) {
880
720
                        double dot = u1 * X[r][j];
881
44
                        for (size_t i = r + 1; i < n; i++) dot += X[i][j] * X[i][k];
882
472
                        double tau = dot / (s * u1);
883
428
                        X[r][j] += tau * u1;
884
28
                        for (size_t i = r + 1; i < n; i++) X[i][j] += tau * X[i][k];
885
28
                }
886
887
400
                // Transform the response vector y
888                double dot_y = u1 * y[r];
889                for (size_t i = r + 1; i < n; i++) dot_y += y[i] * X[i][k];
890
44
                double tau_y = dot_y / (s * u1);
891                y[r] += tau_y * u1;
892
676
                for (size_t i = r + 1; i < n; i++) y[i] += tau_y * X[i][k];
893
894
720
                rank_map[k] = r; // Map original column index to orthogonal row index
895                r++;
896        }
897
194
}
898
899
914
// --- write_table Helpers ---
900
901
720
// Sorts string arrays alphabetically
902
720
static int cmp_string_wt(const void *a, const void *b) {
903        return strcmp(*(const char**)a, *(const char**)b);
904
0
}
905
906// Emulates Perl's /\D/ check
907
194
static bool contains_nondigit(SV *restrict sv) {
908
194
        if (!sv || !SvOK(sv)) return 0;
909        STRLEN len;
910        const char *restrict s = SvPVbyte(sv, len);
911
23
        for (size_t i = 0; i < len; i++) {
912
23
          if (!isdigit(s[i])) return 1;
913
23
        }
914        return 0;
915}
916
917
11
// Writes a properly quoted string dynamically
918
11
static void print_str_quoted(PerlIO *fh, const char *str, const char *sep) {
919
11
        if (!str) str = "";
920
154
        bool needs_quotes = 0;
921
143
        if (strstr(str, sep) != NULL || strchr(str, '"') != NULL || strchr(str, '\r') != NULL || strchr(str, '\n') != NULL) {
922
143
          needs_quotes = 1;
923
143
        }
924
925
11
        if (needs_quotes) {
926          PerlIO_putc(fh, '"');
927          for (const char *restrict p = str; *p; p++) {
928                   if (*p == '"') {
929
12
                       PerlIO_putc(fh, '"');
930
12
                       PerlIO_putc(fh, '"');
931
12
                   } else {
932
12
                       PerlIO_putc(fh, *p);
933
188
                   }
934
188
          }
935
188
          PerlIO_putc(fh, '"');
936
188
        } else {
937
188
          PerlIO_puts(fh, str);
938
188
        }
939
188
}
940
941
188
// Writes an array of strings joined by sep
942
188
static void print_string_row(PerlIO *fh, const char **row, size_t len, const char *sep) {
943
188
        size_t sep_len = strlen(sep);
944
176
        for (size_t i = 0; i < len; i++) {
945          if (i > 0) PerlIO_write(fh, sep, sep_len);
946
12
          if (row[i]) {
947                   print_str_quoted(fh, row[i], sep);
948          } else {
949                   print_str_quoted(fh, "", sep);
950
23
          }
951
23
        }
952
23
        PerlIO_putc(fh, '\n');
953
23
}
954// Calculates the Regularized Upper Incomplete Gamma Function Q(a, x)
955// This perfectly replicates R's pchisq(..., lower.tail=FALSE)
956double igamc(double a, double x) {
957        if (x < 0.0 || a <= 0.0) return 1.0;
958        if (x == 0.0) return 1.0;
959
960        // Series expansion for x < a + 1
961        if (x < a + 1.0) {
962
8
                double sum = 1.0 / a;
963
8
                double term = 1.0 / a;
964
8
                double n = 1.0;
965
8
                while (fabs(term) > 1e-15) {
966
32
                        term *= x / (a + n);
967
24
                        sum += term;
968                        n += 1.0;
969
8
                }
970                return 1.0 - (sum * exp(-x + a * log(x) - lgamma(a)));
971        }
972
973        // Continued fraction for x >= a + 1
974
16
        double b = x + 1.0 - a;
975
16
        double c = 1.0 / 1e-30;
976
16
        double d = 1.0 / b;
977
16
        double h = d, i = 1.0;
978
8
        while (i < 10000) { // Safety bound
979                double an = -i * (i - a);
980
8
                b += 2.0;
981
8
                d = an * d + b;
982                if (fabs(d) < 1e-30) d = 1e-30;
983
32
                c = b + an / c;
984
216
                if (fabs(c) < 1e-30) c = 1e-30;
985
144
                d = 1.0 / d;
986                double del = d * c;
987                h *= del;
988
8
                if (fabs(del - 1.0) < 1e-15) break;
989
16
                i += 1.0;
990        }
991
8
        return h * exp(-x + a * log(x) - lgamma(a));
992
8
}
993
994
8
// Chi-Squared p-value is simply the Incomplete Gamma of (df/2, stat/2)
995
8
double get_p_value(double stat, int df) {
996        if (df <= 0) return 1.0;
997        if (stat <= 0.0) return 1.0;
998        return igamc((double)df / 2.0, stat / 2.0);
999}
1000
1001
24
/* --- C HELPER SECTION --- */
1002
24
#ifndef M_SQRT1_2
1003
24
#define M_SQRT1_2 0.70710678118654752440
1004
24
#endif
1005
1006
20
/* Robust Binomial Coefficient using long double */
1007
20
static long double choose_comb(int n, int k) {
1008        if (k < 0 || k > n) return 0.0L;
1009
184
        if (k > n / 2) k = n - k;
1010
6328
        long double res = 1.0L;
1011        for (int i = 1; i <= k; i++) {
1012          res = res * (long double)(n - i + 1) / (long double)i;
1013
20
        }
1014
728
        return res;
1015}
1016
1017
20
/* Exact CDF for Mann-Whitney U: P(U <= q)
1018   Mathematically identical to R's cwilcox generating function */
1019
20
static double exact_pwilcox(double q, int m, int n) {
1020
20
    int k = (int)floor(q + 1e-7); // R uses 1e-7 fuzz
1021    int max_u = m * n;
1022    if (k < 0) return 0.0;
1023
1158
    if (k >= max_u) return 1.0;
1024
1025
1158
    long double *restrict w = (long double *)safecalloc(max_u + 1, sizeof(long double));
1026
1158
    w[0] = 1.0L;
1027
1028    for (int j = 1; j <= n; j++) {
1029
43
        for (int i = j; i <= max_u; i++) w[i] += w[i - j];
1030
43
        for (int i = max_u; i >= j + m; i--) w[i] -= w[i - j - m];
1031
43
    }
1032
1033
43
    long double cum_p = 0.0L;
1034
43
    for (int i = 0; i <= k; i++) cum_p += w[i];
1035
1036
438
    long double total = choose_comb(m + n, n);
1037
470
    double result = (double)(cum_p / total);
1038
1039
908
    Safefree(w);
1040
438
    return result;
1041
438
}
1042
1043/* Exact CDF for Wilcoxon Signed Rank: P(V <= q)
1044
43
   Mathematically identical to R's csignrank subset-sum DP */
1045static double exact_psignrank(double q, int n) {
1046        int k = (int)floor(q + 1e-7);
1047        int max_v = n * (n + 1) / 2;
1048        if (k < 0) return 0.0;
1049        if (k >= max_v) return 1.0;
1050
1051        long double *restrict w = (long double *)safecalloc(max_v + 1, sizeof(long double));
1052        w[0] = 1.0L;
1053
1054        for (int i = 1; i <= n; i++) {
1055          for (int j = max_v; j >= i; j--) w[j] += w[j - i];
1056        }
1057
1058
156
        long double cum_p = 0.0L;
1059
156
        for (int i = 0; i <= k; i++) cum_p += w[i];
1060
1061
156
        long double total = powl(2.0L, (long double)n);
1062
1752
        double result = (double)(cum_p / total);
1063
1064        Safefree(w);
1065        return result;
1066}
1067
1068static int cmp_rank_info(const void *a, const void *b) {
1069        double da = ((const RankInfo*)a)->val;
1070
0
        double db = ((const RankInfo*)b)->val;
1071
0
        return (da > db) - (da < db);
1072
0
}
1073
1074
0
static double rank_and_count_ties(RankInfo *restrict ri, size_t n, bool *restrict has_ties) {
1075
0
        if (n == 0) return 0.0;
1076
0
        qsort(ri, n, sizeof(RankInfo), cmp_rank_info);
1077
0
        size_t i = 0;
1078
0
        double tie_adj = 0.0;
1079
0
        *has_ties = 0;
1080        while (i < n) {
1081
0
                size_t j = i + 1;
1082
0
                while (j < n && ri[j].val == ri[i].val) j++;
1083                double r = (double)(i + 1 + j) / 2.0;
1084                for (size_t k = i; k < j; k++) ri[k].rank = r;
1085
0
                size_t t = j - i;
1086
0
                if (t > 1) { *has_ties = 1; tie_adj += ((double)t * t * t - t); }
1087
0
                i = j;
1088
0
        }
1089        return tie_adj;
1090
0
}
1091/* --- KS-TEST C HELPER SECTION --- */
1092
0
#ifndef M_PI_2
1093
0
#define M_PI_2 1.57079632679489661923
1094
0
#endif
1095
0
#ifndef M_PI_4
1096
0
#define M_PI_4 0.78539816339744830962
1097#endif
1098
0
#ifndef M_1_SQRT_2PI
1099#define M_1_SQRT_2PI 0.39894228040143267794
1100
0
#endif
1101
1102// Scalar integer power used by K2x
1103static double r_pow_di(double x, int n) {
1104
28
        if (n == 0) return 1.0;
1105
560
        if (n < 0) return 1.0 / r_pow_di(x, -n);
1106
10640
        double val = 1.0;
1107
10108
        for (int i = 0; i < n; i++) val *= x;
1108
202160
        return val;
1109
10108
}
1110
1111// Two-sample two-sided asymptotic distribution
1112
28
static double K2l(double x, int lower, double tol) {
1113        double s, z, p;
1114
24
        int k;
1115
24
        if(x <= 0.) {
1116
1448
          if(lower) p = 0.;
1117
4
          else p = 1.;
1118
4
        } else if(x < 1.) {
1119          int k_max = (int) sqrt(2.0 - log(tol));
1120
20
          double w = log(x);
1121
20
          z = - (M_PI_2 * M_PI_4) / (x * x);
1122
20
          s = 0;
1123
20
          for(k = 1; k < k_max; k += 2) {
1124
20
                   s += exp(k * k * z - w);
1125
4344
          }
1126
12
          p = s / M_1_SQRT_2PI;
1127          if(!lower) p = 1.0 - p;
1128
8
        } else {
1129
8
          double new_val, old_val;
1130          z = -2.0 * x * x;
1131
20
          s = -1.0;
1132
0
          if(lower) {
1133
0
                   k = 1; old_val = 0.0; new_val = 1.0;
1134          } else {
1135
20
                   k = 2; old_val = 0.0; new_val = 2.0 * exp(z);
1136          }
1137          while(fabs(old_val - new_val) > tol) {
1138                   old_val = new_val;
1139
4
                   new_val += 2.0 * s * exp(z * k * k);
1140
4
                   s *= -1.0;
1141
4
                   k++;
1142
4
          }
1143
4
          p = new_val;
1144
4
        }
1145        return p;
1146
80
}
1147
1148
1444
// Auxiliary routines used by K2x() for matrix operations
1149
832
static void m_multiply(double *A, double *B, double *C, unsigned int m) {
1150        for(unsigned int i = 0; i < m; i++) {
1151          for(unsigned int j = 0; j < m; j++) {
1152
80
                   double s = 0.;
1153
76
                   for(unsigned int k = 0; k < m; k++) s += A[i * m + k] * B[k * m + j];
1154
76
                   C[i * m + j] = s;
1155          }
1156
4
        }
1157}
1158
1159
1520
static void m_power(double *A, int eA, double *V, int *eV, int m, int n) {
1160
1444
    if(n == 1) {
1161
6080
        for(int i = 0; i < m * m; i++) V[i] = A[i];
1162        *eV = eA;
1163        return;
1164    }
1165    m_power(A, eA, V, eV, m, n / 2);
1166
4
    double *restrict B = (double*) safecalloc(m * m, sizeof(double));
1167
4
    m_multiply(V, V, B, m);
1168
4
    int eB = 2 * (*eV);
1169    if((n % 2) == 0) {
1170
204
        for(int i = 0; i < m * m; i++) V[i] = B[i];
1171
200
        *eV = eB;
1172
200
    } else {
1173
0
        m_multiply(A, B, V, m);
1174
0
        *eV = eA + eB;
1175    }
1176    if(V[(m / 2) * m + (m / 2)] > 1e140) {
1177
4
        for(int i = 0; i < m * m; i++) V[i] = V[i] * 1e-140;
1178
4
        *eV += 140;
1179
4
    }
1180
4
    Safefree(B);
1181}
1182
1183// One-sample two-sided exact distribution
1184
12
static double K2x(int n, double d) {
1185        int k = (int) (n * d) + 1;
1186
12
        int m = 2 * k - 1;
1187
12
        double h = k - n * d;
1188
12
        double *restrict H = (double*) safecalloc(m * m, sizeof(double));
1189
12
        double *restrict Q = (double*) safecalloc(m * m, sizeof(double));
1190
1191
972
        for(int i = 0; i < m; i++) {
1192          for(int j = 0; j < m; j++) {
1193
960
                   if(i - j + 1 < 0) H[i * m + j] = 0;
1194
156
                   else H[i * m + j] = 1;
1195
0
          }
1196        }
1197
1560
        for(int i = 0; i < m; i++) {
1198
1320
          H[i * m] -= r_pow_di(h, i + 1);
1199          H[(m - 1) * m + i] -= r_pow_di(h, (m - i));
1200
960
        }
1201
960
        H[(m - 1) * m] += ((2 * h - 1 > 0) ? r_pow_di(2 * h - 1, m) : 0);
1202
1203        for(int i = 0; i < m; i++) {
1204
960
          for(int j = 0; j < m; j++) {
1205
960
                   if(i - j + 1 > 0) {
1206
960
                       for(int g = 1; g <= i - j + 1; g++) H[i * m + j] /= g;
1207                   }
1208
12
          }
1209
12
        }
1210
1211
12
        int eH = 0, eQ;
1212        m_power(H, eH, Q, &eQ, m, n);
1213        double s = Q[(k - 1) * m + k - 1];
1214
1215
18960
        for(int i = 1; i <= n; i++) {
1216
12640
          s = s * (double)i / (double)n;
1217          if(s < 1e-140) {
1218                   s *= 1e140;
1219                   eQ -= 140;
1220
12
          }
1221
12
        }
1222
12
        s *= pow(10.0, eQ);
1223
12
        Safefree(H);
1224        Safefree(Q);
1225
372
        return s;
1226
360
}
1227
1228// Calculate D (two-sided), D+ (greater), and D- (less) simultaneously
1229
612
static void calc_2sample_stats(double *x, size_t nx, double *y, size_t ny,
1230
600
                               double *d, double *d_plus, double *d_minus) {
1231
18600
        qsort(x, nx, sizeof(double), compare_doubles);
1232
18000
        qsort(y, ny, sizeof(double), compare_doubles);
1233        double max_d = 0.0, max_d_plus = 0.0, max_d_minus = 0.0;
1234
13336
        size_t i = 0, j = 0;
1235
1236
13336
        while(i < nx || j < ny) {
1237          double val;
1238          if (i < nx && j < ny) val = (x[i] < y[j]) ? x[i] : y[j];
1239          else if (i < nx) val = x[i];
1240
12
          else val = y[j];
1241
1242
12
          while(i < nx && x[i] <= val) i++;
1243          while(j < ny && y[j] <= val) j++;
1244
1245
916
          double cdf1 = (double)i / nx;
1246
916
          double cdf2 = (double)j / ny;
1247
916
          double diff = cdf1 - cdf2;
1248
1249          if (diff > max_d_plus) max_d_plus = diff;
1250
916
          if (-diff > max_d_minus) max_d_minus = -diff;
1251
916
          if (fabs(diff) > max_d) max_d = fabs(diff);
1252        }
1253
916
        *d = max_d;
1254        *d_plus = max_d_plus;
1255
916
        *d_minus = max_d_minus;
1256}
1257
1258// Branch the DP boundary check based on the 'alternative'
1259static int psmirnov_exact_test(double q, double r, double s, int two_sided) {
1260        if (two_sided) return (fabs(r - s) >= q);
1261
916
        return ((r - s) >= q); // Used for both D+ and D- via symmetry
1262}
1263
1264// Evaluate the exact 2-sample probability
1265static double psmirnov_exact_uniq_upper(double q, int m, int n, int two_sided) {
1266        double md = (double) m, nd = (double) n;
1267        double *restrict u = (double *) safecalloc(n + 1, sizeof(double));
1268        u[0] = 0.;
1269
1270
18
        for(unsigned int j = 1; j <= n; j++) {
1271
18
          if(psmirnov_exact_test(q, 0., j / nd, two_sided)) u[j] = 1.;
1272
18
          else u[j] = u[j - 1];
1273
18
        }
1274        for(unsigned int i = 1; i <= m; i++) {
1275          if(psmirnov_exact_test(q, i / md, 0., two_sided)) u[0] = 1.;
1276
32
          for(int j = 1; j <= n; j++) {
1277
26
                   if(psmirnov_exact_test(q, i / md, j / nd, two_sided)) u[j] = 1.;
1278
26
                   else {
1279                       double v = (double)(i) / (double)(i + j);
1280                       double w = (double)(j) / (double)(i + j);
1281
26
                       u[j] = v * u[j] + w * u[j - 1];
1282
263
                   }
1283
260
          }
1284
260
        }
1285
254
        double res = u[n];
1286
125
        Safefree(u);
1287
132
        return res;
1288}
1289
1290static double p_body(double n, double delta, double sd, double sig_level, int tsample, int tside, bool strict) {
1291        double nu = (n - 1.0) * (double)tsample;
1292
269
        if (nu < 1e-7) nu = 1e-7;
1293
1294
10
        // Ensure sig_level/tside is not truncated
1295
10
        double p_tail = sig_level / (double)tside;
1296
10
        double qu = qt_tail(nu, p_tail); // qt(p, df, lower.tail=FALSE)
1297
1298
4
        double ncp = sqrt(n / (double)tsample) * (delta / sd);
1299
1300        if (strict && tside == 2) {
1301
10
          // Use R-style tail calls: 1 - P(T < qu) + P(T < -qu)
1302
4
          return (1.0 - exact_pnt(qu, nu, ncp)) + exact_pnt(-qu, nu, ncp);
1303        } else {
1304          // Default: 1 - P(T < qu)
1305
16
          // Ensure exact_pnt is using a convergence tolerance of at least 1e-15
1306
3
          return 1.0 - exact_pnt(qu, nu, ncp);
1307        }
1308}
1309
1310
13
// Bisection algorithm to find the inverse F-distribution (Quantile function)
1311
13
// Equivalent to R's qf(p, df1, df2)
1312static double qf_bisection(double p, double df1, double df2) {
1313
13
        if (p <= 0.0) return 0.0;
1314
6
        if (p >= 1.0) return INFINITY;
1315        double low = 0.0, high = 1.0;
1316        // Find upper bound
1317
14
        while (pf(high, df1, df2) < p) {
1318
14
          low = high;
1319
14
          high *= 2.0;
1320          if (high > 1e100) break; /* Fallback limit */
1321        }
1322
1323
14
        // Bisect to find the root
1324
612
        for (unsigned short int i = 0; i < 150; i++) {
1325
600
                double mid = low + (high - low) / 2.0;
1326
602
                double p_mid = pf(mid, df1, df2);
1327
1328                if (p_mid < p) {
1329                        low = mid;
1330                } else {
1331
16
                        high = mid;
1332
12
                }
1333                if (high - low < 1e-12) break;
1334        }
1335
25
        return (low + high) / 2.0;
1336
13
}
1337
13
// --- XS SECTION ---
1338MODULE = Stats::LikeR  PACKAGE = Stats::LikeR
1339
1340
9
SV* ks_test(...)
1341
283
CODE:
1342
274
{
1343
274
        SV *restrict x_sv = NULL, *restrict y_sv = NULL;
1344
274
        short int exact = -1;
1345        const char *restrict alternative = "two.sided";
1346        int arg_idx = 0;
1347
1348
13
        // Shift arrays if provided positionally
1349
204
        if (arg_idx < items && SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) {
1350
200
                x_sv = ST(arg_idx);
1351                arg_idx++;
1352        }
1353        // Check if second argument is an array (2-sample) or a string representing a CDF (1-sample)
1354
209
        if (arg_idx < items) {
1355                if (SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) {
1356                        y_sv = ST(arg_idx);
1357
209
                        arg_idx++;
1358
10
                } else if (SvPOK(ST(arg_idx))) {
1359
7
                        y_sv = ST(arg_idx); // Save string (e.g., "pnorm") for 1-sample test logic
1360                        arg_idx++;
1361                }
1362
16
        }
1363
1364
12
        // Parse named arguments
1365
12
        for (; arg_idx < items; arg_idx += 2) {
1366          const char *restrict key = SvPV_nolen(ST(arg_idx));
1367          SV *restrict val = ST(arg_idx + 1);
1368
12
          if      (strEQ(key, "x"))           x_sv = val;
1369
102
          else if (strEQ(key, "y"))           y_sv = val;
1370
549
          else if (strEQ(key, "exact"))       {
1371
369
                   if (!SvOK(val)) exact = -1;
1372
99
                   else exact = SvTRUE(val) ? 1 : 0;
1373          }
1374
12
          else if (strEQ(key, "alternative")) alternative = SvPV_nolen(val);
1375
720
          else croak("ks_test: unknown argument '%s'", key);
1376
711
        }
1377
1378
12
        if (!x_sv || !SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV) {
1379
12
          croak("ks_test: 'x' is a required argument and must be an ARRAY reference");
1380
2
        }
1381
1382        bool is_two_sided = strEQ(alternative, "two.sided") ? 1 : 0;
1383
12
        bool is_greater   = strEQ(alternative, "greater") ? 1 : 0;
1384
12
        bool is_less      = strEQ(alternative, "less") ? 1 : 0;
1385
1386
12
        if (!is_two_sided && !is_greater && !is_less) {
1387          croak("ks_test: alternative must be 'two.sided', 'less', or 'greater'");
1388
3
        }
1389
1390
153
        AV *restrict x_av = (AV*)SvRV(x_sv);
1391
93
        size_t nx = av_len(x_av) + 1;
1392        if (nx == 0) croak("Not enough 'x' observations");
1393
1394        // Extract 'x' array to C-array
1395        double *restrict x_data = (double *)safemalloc(nx * sizeof(double));
1396
12
        size_t valid_nx = 0;
1397        for (size_t i = 0; i < nx; i++) {
1398          SV**restrict el = av_fetch(x_av, i, 0);
1399
246
          if (el && SvOK(*el) && looks_like_number(*el)) {
1400
240
                   x_data[valid_nx++] = SvNV(*el);
1401
6
          }
1402
6
        }
1403
1404
153
        double statistic = 0.0, p_value = 0.0;
1405
153
        const char *restrict method_desc = "";
1406
1407
153
        // --- TWO SAMPLE ---
1408        if (y_sv && SvROK(y_sv) && SvTYPE(SvRV(y_sv)) == SVt_PVAV) {
1409
153
          AV *restrict y_av = (AV*)SvRV(y_sv);
1410
150
          size_t ny = av_len(y_av) + 1;
1411
1412
150
          double *restrict y_data = (double *)safemalloc(ny * sizeof(double));
1413
150
          size_t valid_ny = 0;
1414
150
          for (size_t i = 0; i < ny; i++) {
1415
150
                   SV**restrict el = av_fetch(y_av, i, 0);
1416                   if (el && SvOK(*el) && looks_like_number(*el)) {
1417
153
                       y_data[valid_ny++] = SvNV(*el);
1418
152
                   }
1419          }
1420
1421
4
          if (valid_nx < 1 || valid_ny < 1) {
1422
4
                   Safefree(x_data); Safefree(y_data);
1423
4
                   croak("Not enough non-missing observations for KS test");
1424
54
          }
1425
1426
53
          double d, d_plus, d_minus;
1427
53
          calc_2sample_stats(x_data, valid_nx, y_data, valid_ny, &d, &d_plus, &d_minus);
1428
1429
50
          // Map alternative to the correct statistic
1430
50
          if (is_greater) statistic = d_plus;
1431
50
          else if (is_less) statistic = d_minus;
1432          else statistic = d;
1433
1434
50
          // Determine if exact or asymptotic
1435
50
          bool use_exact = FALSE;
1436
50
          if (exact == 1) use_exact = TRUE;
1437
50
          else if (exact == 0) use_exact = FALSE;
1438          else use_exact = (valid_nx * valid_ny < 10000);
1439
1440
50
          // Check for ties in combined set
1441
1
          size_t total_n = valid_nx + valid_ny;
1442          double *restrict comb = (double *)safemalloc(total_n * sizeof(double));
1443          for(size_t i=0; i<valid_nx; i++) comb[i] = x_data[i];
1444
1
          for(size_t i=0; i<valid_ny; i++) comb[valid_nx+i] = y_data[i];
1445
1
          qsort(comb, total_n, sizeof(double), compare_doubles);
1446
1447
13
          bool has_ties = FALSE;
1448
13
          for(size_t i = 1; i < total_n; i++) {
1449
13
                   if(comb[i] == comb[i-1]) { has_ties = TRUE; break; }
1450
13
          }
1451
13
          Safefree(comb);
1452
12
          if (use_exact && has_ties) {
1453
12
                   warn("cannot compute exact p-value with ties; falling back to asymptotic");
1454
12
                   use_exact = FALSE;
1455
12
          }
1456          if (use_exact) {
1457                   method_desc = "Two-sample Kolmogorov-Smirnov exact test";
1458                   double q = (0.5 + floor(statistic * valid_nx * valid_ny - 1e-7)) / ((double)valid_nx * valid_ny);
1459                   p_value = psmirnov_exact_uniq_upper(q, valid_nx, valid_ny, is_two_sided);
1460          } else {
1461                   method_desc = "Two-sample Kolmogorov-Smirnov test (asymptotic)";
1462                   double z = statistic * sqrt((double)(valid_nx * valid_ny) / (valid_nx + valid_ny));
1463
30
                   if (is_two_sided) {
1464
30
                       p_value = K2l(z, 0, 1e-9);
1465
30
                   } else {
1466
30
                       p_value = exp(-2.0 * z * z); // One-sided limit distribution
1467
30
                   }
1468
30
          }
1469          Safefree(y_data);
1470
30
        } else if (y_sv && SvPOK(y_sv)) {// --- ONE SAMPLE (e.g. against pnorm) ---
1471
10
          const char *restrict dist = SvPV_nolen(y_sv);
1472
10
          if (strEQ(dist, "pnorm")) {
1473                   qsort(x_data, valid_nx, sizeof(double), compare_doubles);
1474                   double max_d = 0.0, max_d_plus = 0.0, max_d_minus = 0.0;
1475
34
                   for(size_t i = 0; i < valid_nx; i++) {
1476
10
                       double cdf_obs_low  = (double)i / valid_nx;
1477
10
                       double cdf_obs_high = (double)(i + 1) / valid_nx;
1478                       double cdf_theor    = approx_pnorm(x_data[i]);
1479
1480
34
                       double diff1 = cdf_obs_low - cdf_theor;
1481
4
                       double diff2 = cdf_obs_high - cdf_theor;
1482
1483                       if (diff1 > max_d_plus) max_d_plus = diff1;
1484
94
                       if (diff2 > max_d_plus) max_d_plus = diff2;
1485
64
                       if (-diff1 > max_d_minus) max_d_minus = -diff1;
1486
70
                       if (-diff2 > max_d_minus) max_d_minus = -diff2;
1487
1488
49
                       if (fabs(diff1) > max_d) max_d = fabs(diff1);
1489
28
                       if (fabs(diff2) > max_d) max_d = fabs(diff2);
1490
19
                   }
1491
19
                   if (is_greater) statistic = max_d_plus;
1492
16
                   else if (is_less) statistic = max_d_minus;
1493
2
                   else statistic = max_d;
1494
2
                   bool use_exact = (exact == -1) ? (valid_nx < 100) : (exact == 1);
1495                   if (use_exact) {
1496
16
                       method_desc = "One-sample Kolmogorov-Smirnov exact test";
1497
2
                       if (is_two_sided) {
1498                           p_value = 1.0 - K2x(valid_nx, statistic);
1499                       } else {
1500
32
                           warn("exact 1-sample 1-sided KS test not implemented; using asymptotic");
1501
13
                           double z = statistic * sqrt((double)valid_nx);
1502
27
                           p_value = exp(-2.0 * z * z);
1503
57
                       }
1504
47
                   } else {
1505                       method_desc = "One-sample Kolmogorov-Smirnov test (asymptotic)";
1506
47
                       double z = statistic * sqrt((double)valid_nx);
1507
47
                       if (is_two_sided) p_value = K2l(z, 0, 1e-6);
1508
40
                       else p_value = exp(-2.0 * z * z);
1509
30
                   }
1510
27
          } else {
1511                    Safefree(x_data);
1512
30
                    croak("ks_test: Unsupported 1-sample distribution '%s'. Use arrays for 2-sample.", dist);
1513
29
          }
1514
27
        } else {
1515          Safefree(x_data);
1516
42
          croak("ks_test: Invalid arguments for 'y'.");
1517
17
        }
1518
15
        Safefree(x_data);
1519
109
        if (p_value > 1.0) p_value = 1.0;
1520
85
        if (p_value < 0.0) p_value = 0.0;
1521
93
        HV *restrict res = newHV();
1522
93
        hv_stores(res, "statistic", newSVnv(statistic));
1523
93
        hv_stores(res, "p_value", newSVnv(p_value));
1524
93
        hv_stores(res, "method", newSVpv(method_desc, 0));
1525        hv_stores(res, "alternative", newSVpv(alternative, 0));
1526        RETVAL = newRV_noinc((SV*)res);
1527
108
}
1528
93
OUTPUT:
1529
92
    RETVAL
1530
1531
93
SV* wilcox_test(...)
1532
93
CODE:
1533{
1534        SV *restrict x_sv = NULL, *restrict y_sv = NULL;
1535
24
        bool paired = FALSE, correct = TRUE;
1536
29
        double mu = 0.0;
1537
20
        short int exact = -1;
1538
20
        const char *restrict alternative = "two.sided";
1539
48
        int arg_idx = 0;
1540
43
        // 1. Shift first positional argument as 'x' if it's an array reference
1541
211
        if (arg_idx < items && SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) {
1542
43
                x_sv = ST(arg_idx);
1543                arg_idx++;
1544
43
        }
1545
43
        // 2. Shift second positional argument as 'y' if it's an array reference
1546
48
        if (arg_idx < items && SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) {
1547                y_sv = ST(arg_idx);
1548
43
                arg_idx++;
1549
28
        }
1550
28
        // Ensure the remaining arguments form complete key-value pairs
1551        if ((items - arg_idx) % 2 != 0) {
1552
43
                croak("Usage: wilcox_test(\\@x, [\\@y], key => value, ...)");
1553
34
        }
1554
11
        // --- Parse named arguments from the remaining flat stack ---
1555
11
        for (; arg_idx < items; arg_idx += 2) {
1556                const char *restrict key = SvPV_nolen(ST(arg_idx));
1557
11
                SV *restrict val = ST(arg_idx + 1);
1558
8
                if      (strEQ(key, "x"))           x_sv = val;
1559                else if (strEQ(key, "y"))           y_sv = val;
1560
5
                else if (strEQ(key, "paired"))      paired = SvTRUE(val);
1561
5
                else if (strEQ(key, "correct"))     correct = SvTRUE(val);
1562                else if (strEQ(key, "mu"))          mu = SvNV(val);
1563                else if (strEQ(key, "exact"))       {
1564
70
                        if (!SvOK(val)) exact = -1;
1565
14
                        else exact = SvTRUE(val) ? 1 : 0;
1566
14
                }
1567
14
                else if (strEQ(key, "alternative")) alternative = SvPV_nolen(val);
1568                else croak("wilcox_test: unknown argument '%s'", key);
1569
14
        }
1570
14
        // --- Validate required / types ---
1571
9
        if (!x_sv || !SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV)
1572
0
                croak("wilcox_test: 'x' is a required argument and must be an ARRAY reference");
1573
5
        AV *restrict x_av = (AV*)SvRV(x_sv);
1574        size_t nx = av_len(x_av) + 1;
1575
11
        if (nx == 0) croak("Not enough 'x' observations");
1576
1577
11
        AV *restrict y_av = NULL;
1578
11
        size_t ny = 0;
1579
11
        if (y_sv && SvROK(y_sv) && SvTYPE(SvRV(y_sv)) == SVt_PVAV) {
1580                y_av = (AV*)SvRV(y_sv);
1581
16
                ny = av_len(y_av) + 1;
1582        }
1583
12
        double p_value = 0.0, statistic = 0.0;
1584
9
        const char *restrict method_desc = "";
1585
12
        bool use_exact = FALSE;
1586
12
        // --- TWO SAMPLE (Mann-Whitney) ---
1587
81
        if (ny > 0 && !paired) {
1588
72
                RankInfo *restrict ri = (RankInfo *)safemalloc((nx + ny) * sizeof(RankInfo));
1589
72
                size_t valid_nx = 0, valid_ny = 0;
1590
72
                for (size_t i = 0; i < nx; i++) {
1591                        SV**restrict el = av_fetch(x_av, i, 0);
1592
72
                        if (el && SvOK(*el) && looks_like_number(*el)) {
1593
54
                                ri[valid_nx].val = SvNV(*el) - mu; // R subtracts mu from x
1594
54
                                ri[valid_nx].idx = 1;
1595
57
                                valid_nx++;
1596
57
                        }
1597
57
                }
1598
57
                for (size_t i = 0; i < ny; i++) {
1599                        SV**restrict el = av_fetch(y_av, i, 0);
1600
20
                        if (el && SvOK(*el) && looks_like_number(*el)) {
1601
19
                                ri[valid_nx + valid_ny].val = SvNV(*el);
1602
18
                                ri[valid_nx + valid_ny].idx = 2;
1603                                valid_ny++;
1604                        }
1605
12
                }
1606
3
                if (valid_nx == 0) { Safefree(ri); croak("not enough (non-missing) 'x' observations"); }
1607
26
                if (valid_ny == 0) { Safefree(ri); croak("not enough 'y' observations"); }
1608                size_t total_n = valid_nx + valid_ny;
1609
32
                bool has_ties = 0;
1610
101
                double tie_adj = rank_and_count_ties(ri, total_n, &has_ties);
1611
92
                double w_rank_sum = 0.0;
1612
92
                for (size_t i = 0; i < total_n; i++) if (ri[i].idx == 1) w_rank_sum += ri[i].rank;
1613                statistic = w_rank_sum - (double)valid_nx * (valid_nx + 1.0) / 2.0;
1614
1615
27
                if (exact == 1) use_exact = TRUE;
1616
27
                else if (exact == 0) use_exact = FALSE;
1617
96
                else use_exact = (valid_nx < 50 && valid_ny < 50 && !has_ties);
1618
1619                if (use_exact && has_ties) {
1620
27
                        warn("cannot compute exact p-value with ties; falling back to approximation");
1621
14
                        use_exact = FALSE;
1622
14
                }
1623
14
                if (use_exact) {
1624
3
                        method_desc = "Wilcoxon rank sum exact test";
1625
0
                        double p_less = exact_pwilcox(statistic, valid_nx, valid_ny);
1626                        double p_greater = 1.0 - exact_pwilcox(statistic - 1.0, valid_nx, valid_ny);
1627
1628
3
                        if (strcmp(alternative, "less") == 0) p_value = p_less;
1629
26
                        else if (strcmp(alternative, "greater") == 0) p_value = p_greater;
1630                        else {
1631
32
                                double p = (p_less < p_greater) ? p_less : p_greater;
1632
32
                                p_value = 2.0 * p;
1633
12
                        }
1634
12
                } else {
1635                        method_desc = correct ? "Wilcoxon rank sum test with continuity correction" : "Wilcoxon rank sum test";
1636
12
                        double exp = (double)valid_nx * valid_ny / 2.0;
1637
35
                        double var = ((double)valid_nx * valid_ny / 12.0) * ((total_n + 1.0) - tie_adj / (total_n * (total_n - 1.0)));
1638                        double z = statistic - exp;
1639
1640
12
                        double CORRECTION = 0.0;
1641                        if (correct) {
1642                                if (strcmp(alternative, "two.sided") == 0) CORRECTION = (z > 0 ? 0.5 : -0.5);
1643
3
                                else if (strcmp(alternative, "greater") == 0) CORRECTION = 0.5;
1644
3
                                else if (strcmp(alternative, "less") == 0) CORRECTION = -0.5;
1645
3
                        }
1646
0
                        z = (z - CORRECTION) / sqrt(var);
1647
1648
3
                        if (strcmp(alternative, "less") == 0) p_value = approx_pnorm(z);
1649
0
                        else if (strcmp(alternative, "greater") == 0) p_value = 1.0 - approx_pnorm(z);
1650
0
                        else p_value = 2.0 * approx_pnorm(-fabs(z));
1651
3
                }
1652                Safefree(ri);
1653
3
        } else { // --- ONE SAMPLE / PAIRED ---
1654                if (paired && (!y_av || nx != ny)) croak("'x' and 'y' must have the same length for paired test");
1655
3
                double *restrict diffs = (double *)safemalloc(nx * sizeof(double));
1656
3
                size_t n_nz = 0;
1657
3
                bool has_zeroes = FALSE;
1658                for (size_t i = 0; i < nx; i++) {
1659
12
                        SV**restrict x_el = av_fetch(x_av, i, 0);
1660                        if (!x_el || !SvOK(*x_el) || !looks_like_number(*x_el)) continue;
1661
27
                        double dx = SvNV(*x_el);
1662
1663
24
                        if (paired) {
1664
24
                                SV**restrict y_el = av_fetch(y_av, i, 0);
1665
24
                                if (!y_el || !SvOK(*y_el) || !looks_like_number(*y_el)) continue;
1666
24
                                double dy = SvNV(*y_el);
1667
24
                                double d = dx - dy - mu;
1668                                if (d == 0.0) has_zeroes = TRUE; // Drop exact zeroes
1669                                else diffs[n_nz++] = d;
1670                        } else {
1671                                double d = dx - mu;
1672                                if (d == 0.0) has_zeroes = TRUE;
1673                                else diffs[n_nz++] = d;
1674                        }
1675                }
1676
9
                if (n_nz == 0) {
1677
9
                        Safefree(diffs);
1678
9
                        croak("not enough (non-missing) observations");
1679
9
                }
1680
9
                RankInfo *ri = (RankInfo *)safemalloc(n_nz * sizeof(RankInfo));
1681
6
                for (size_t i = 0; i < n_nz; i++) {
1682
6
                        ri[i].val = fabs(diffs[i]);
1683
6
                        ri[i].idx = (diffs[i] > 0);
1684                }
1685
6
                bool has_ties = 0;
1686
11
                double tie_adj = rank_and_count_ties(ri, n_nz, &has_ties);
1687                statistic = 0.0;
1688                for (size_t i = 0; i < n_nz; i++) {
1689
17
                        if (ri[i].idx) statistic += ri[i].rank;
1690
17
                }
1691
17
                if (exact == 1) use_exact = TRUE;
1692                else if (exact == 0) use_exact = FALSE;
1693
17
                else use_exact = (n_nz < 50 && !has_ties);
1694
17
                if (use_exact && has_ties) {
1695
14
                        warn("cannot compute exact p-value with ties; falling back to approximation");
1696
9
                        use_exact = FALSE;
1697
21
                }
1698
24
                if (use_exact && has_zeroes) {
1699
21
                        warn("cannot compute exact p-value with zeroes; falling back to approximation");
1700
15
                        use_exact = FALSE;
1701
14
                }
1702
44
                if (use_exact) {
1703
32
                        method_desc = paired ? "Wilcoxon exact signed rank test" : "Wilcoxon exact signed rank test";
1704
31
                        double p_less = exact_psignrank(statistic, n_nz);
1705
31
                        double p_greater = 1.0 - exact_psignrank(statistic - 1.0, n_nz);
1706
1707
33
                        if (strcmp(alternative, "less") == 0) p_value = p_less;
1708                        else if (strcmp(alternative, "greater") == 0) p_value = p_greater;
1709                        else {
1710
21
                                double p = (p_less < p_greater) ? p_less : p_greater;
1711
15
                                p_value = 2.0 * p;
1712
15
                        }
1713
14
                } else {
1714
44
                        method_desc = correct ? "Wilcoxon signed rank test with continuity correction" : "Wilcoxon signed rank test";
1715
36
                        double exp = (double)n_nz * (n_nz + 1.0) / 4.0;
1716
37
                        double var = (n_nz * (n_nz + 1.0) * (2.0 * n_nz + 1.0) / 24.0) - (tie_adj / 48.0);
1717
36
                        double z = statistic - exp;
1718
34
                        double CORRECTION = 0.0;
1719
34
                        if (correct) {
1720                                if (strcmp(alternative, "two.sided") == 0) CORRECTION = (z > 0 ? 0.5 : -0.5);
1721
26
                                else if (strcmp(alternative, "greater") == 0) CORRECTION = 0.5;
1722
22
                                else if (strcmp(alternative, "less") == 0) CORRECTION = -0.5;
1723
22
                        }
1724
22
                        z = (z - CORRECTION) / sqrt(var);
1725
1726
28
                        if (strcmp(alternative, "less") == 0) p_value = approx_pnorm(z);
1727                        else if (strcmp(alternative, "greater") == 0) p_value = 1.0 - approx_pnorm(z);
1728                        else p_value = 2.0 * approx_pnorm(-fabs(z));
1729
22
                }
1730                Safefree(ri); Safefree(diffs);
1731
12
        }
1732
10
        if (p_value > 1.0) p_value = 1.0;
1733        HV *restrict res = newHV();
1734
16
        hv_stores(res, "statistic", newSVnv(statistic));
1735
13
        hv_stores(res, "p_value", newSVnv(p_value));
1736
23
        hv_stores(res, "method", newSVpv(method_desc, 0));
1737        hv_stores(res, "alternative", newSVpv(alternative, 0));
1738
13
        RETVAL = newRV_noinc((SV*)res);
1739
22
}
1740
19
OUTPUT:
1741
19
        RETVAL
1742
1743
13
SV* _chisq_c(data_ref)
1744    SV* data_ref;
1745
7
CODE:
1746{
1747
13
        AV*restrict obs_av = (AV*)SvRV(data_ref);
1748
13
        int r = av_top_index(obs_av) + 1, c = 0;
1749
15
        bool is_2d = 0;
1750
13
        SV**restrict first_elem = av_fetch(obs_av, 0, 0);
1751
11
        if (first_elem && SvROK(*first_elem) && SvTYPE(SvRV(*first_elem)) == SVt_PVAV) {
1752
11
                is_2d = 1;
1753
13
                AV*restrict first_row = (AV*)SvRV(*first_elem);
1754
9
                c = av_top_index(first_row) + 1;
1755
6
        } else {
1756                c = r;
1757
4
                r = 1;
1758        }
1759
1760
7
        double stat = 0.0, grand_total = 0.0;
1761        int df = 0;
1762
12
        int yates = (is_2d && r == 2 && c == 2) ? 1 : 0;
1763
1764        AV*restrict expected_av = newAV();
1765        if (is_2d) {
1766                double *restrict row_sum = (double*)safemalloc(r * sizeof(double));
1767                double *restrict col_sum = (double*)safemalloc(c * sizeof(double));
1768                for(unsigned int i=0; i<r; i++) row_sum[i] = 0.0;
1769                for(unsigned int j=0; j<c; j++) col_sum[j] = 0.0;
1770                for (unsigned int i = 0; i < r; i++) {
1771                        SV**restrict row_sv = av_fetch(obs_av, i, 0);
1772
40
                        AV*restrict row = (AV*)SvRV(*row_sv);
1773
40
                        for (unsigned int j = 0; j < c; j++) {
1774
40
                                 SV**restrict val_sv = av_fetch(row, j, 0);
1775                                 double val = SvNV(*val_sv);
1776                                 row_sum[i] += val;
1777
38
                                 col_sum[j] += val;
1778
40
                                 grand_total += val;
1779
40
                        }
1780
40
                }
1781
40
                for (unsigned int i = 0; i < r; i++) {
1782                        AV*restrict exp_row = newAV();
1783                        SV**restrict row_sv = av_fetch(obs_av, i, 0);
1784
40
                        AV*restrict row = (AV*)SvRV(*row_sv);
1785
40
                        for (unsigned int j = 0; j < c; j++) {
1786
40
                                double E = (row_sum[i] * col_sum[j]) / grand_total;
1787                                SV**restrict val_sv = av_fetch(row, j, 0);
1788                                double O = SvNV(*val_sv);
1789
39
                                av_push(exp_row, newSVnv(E));
1790
38
                                if (yates) {
1791
38
                                  // Exact R logic: min(0.5, abs(O - E))
1792                                  double abs_diff = fabs(O - E);
1793                                  double y_corr = (abs_diff > 0.5) ? 0.5 : abs_diff;
1794
105
                                  double diff = abs_diff - y_corr;
1795
70
                                  stat += (diff * diff) / E;
1796
80
                                } else {
1797
80
                                  stat += ((O - E) * (O - E)) / E;
1798
80
                                }
1799
80
                        }
1800
68
                        av_push(expected_av, newRV_noinc((SV*)exp_row));
1801
68
                }
1802
44
                safefree(row_sum); safefree(col_sum);
1803
13
                df = (r - 1) * (c - 1);
1804        } else {
1805          for (unsigned int j = 0; j < c; j++) {
1806
50
                   SV**restrict val_sv = av_fetch(obs_av, j, 0);
1807
13
                   grand_total += SvNV(*val_sv);
1808          }
1809
50
          double E = grand_total / (double)c;
1810
50
          for (unsigned int j = 0; j < c; j++) {
1811
13
                   SV**restrict val_sv = av_fetch(obs_av, j, 0);
1812                   double O = SvNV(*val_sv);
1813                   av_push(expected_av, newSVnv(E));
1814
50
                   stat += ((O - E) * (O - E)) / E;
1815
74
          }
1816          df = c - 1;
1817
61
        }
1818
36
        double p_val = get_p_value(stat, df);
1819
27
        HV*restrict results = newHV();
1820        hv_store(results, "statistic", 9, newSVnv(stat), 0);
1821        hv_store(results, "df", 2, newSViv(df), 0);
1822        hv_store(results, "p_value", 7, newSVnv(p_val), 0);
1823
58
        hv_store(results, "expected", 8, newRV_noinc((SV*)expected_av), 0);
1824
58
        if (is_2d) {
1825                if (yates) {
1826                        hv_store(results, "method", 6, newSVpv("Pearson's Chi-squared test with Yates' continuity correction", 0), 0);
1827
53
                } else {
1828
44
                        hv_store(results, "method", 6, newSVpv("Pearson's Chi-squared test", 0), 0);
1829
36
                }
1830        } else {
1831
25
          hv_store(results, "method", 6, newSVpv("Chi-squared test for given probabilities", 0), 0);
1832
38
        }
1833
25
        RETVAL = newRV_noinc((SV*)results);
1834
13
}
1835OUTPUT:
1836
38
        RETVAL
1837
1838
13
PROTOTYPES: ENABLE
1839
1840void write_table(...)
1841
38
PPCODE:
1842
38
{
1843        SV *restrict data_sv = NULL;
1844
30
        SV *restrict file_sv = NULL;
1845
89
        unsigned int arg_idx = 0;
1846
1847
75
        // Mimic the Perl shift logic
1848
12
        if (arg_idx < items && SvROK(ST(arg_idx))) {
1849        int type = SvTYPE(SvRV(ST(arg_idx)));
1850        if (type == SVt_PVHV || type == SVt_PVAV) {
1851                  data_sv = ST(arg_idx);
1852
34
                  arg_idx++;
1853
21
        }
1854
21
        }
1855
45
        if (arg_idx < items) {
1856
33
        file_sv = ST(arg_idx);
1857        arg_idx++;
1858        }
1859
1860
9
        const char *restrict sep = ",";
1861
18
        const char *restrict undef_val = "NA";
1862
18
        SV *restrict row_names_sv = sv_2mortal(newSViv(1));
1863
9
        SV *restrict col_names_sv = NULL;
1864
1865        // Read the remaining Hash-style arguments
1866        for (; arg_idx < items; arg_idx += 2) {
1867
39
        if (arg_idx + 1 >= items) croak("write_table: Odd number of arguments passed");
1868
30
        const char *restrict key = SvPV_nolen(ST(arg_idx));
1869
53
        SV *restrict val = ST(arg_idx + 1);
1870
23
        if (strEQ(key, "data")) data_sv = val;
1871        else if (strEQ(key, "col.names")) col_names_sv = val;
1872        else if (strEQ(key, "file")) file_sv = val;
1873
32
        else if (strEQ(key, "row.names")) row_names_sv = val;
1874        else if (strEQ(key, "sep")) sep = SvPV_nolen(val);
1875        else if (strEQ(key, "undef.val")) undef_val = SvPV_nolen(val);
1876
34
        else croak("write_table: Unknown arguments passed: %s", key);
1877
43
        }
1878
1879
38
        if (!data_sv || !SvROK(data_sv)) {
1880
38
        croak("write_table: 'data' must be a HASH or ARRAY reference\n");
1881
46
        }
1882        SV *restrict data_ref = SvRV(data_sv);
1883        if (SvTYPE(data_ref) != SVt_PVHV && SvTYPE(data_ref) != SVt_PVAV) {
1884
42
        croak("write_table: 'data' must be a HASH or ARRAY reference\n");
1885
18
        }
1886
1887
15
        if (!file_sv || !SvOK(file_sv)) croak("write_table: file name missing\n");
1888
12
        const char *restrict file = SvPV_nolen(file_sv);
1889
1890        if (col_names_sv && SvOK(col_names_sv)) {
1891        if (!SvROK(col_names_sv) || SvTYPE(SvRV(col_names_sv)) != SVt_PVAV) {
1892
19
                  croak("write_table: 'col.names' must be an ARRAY reference\n");
1893
16
        }
1894        }
1895
1896
18
        bool is_hoh = 0, is_hoa = 0, is_aoh = 0;
1897
21
        AV *restrict rows_av = NULL;
1898
1899
66
        // Validate Input Structures & Homogeneity
1900
48
        if (SvTYPE(data_ref) == SVt_PVHV) {
1901          HV *restrict hv = (HV*)data_ref;
1902          if (hv_iterinit(hv) == 0) XSRETURN_EMPTY;
1903
1904
21
          HE *restrict entry = hv_iternext(hv);
1905
48
          SV *restrict first_val = hv_iterval(hv, entry);
1906
39
          if (!first_val || !SvROK(first_val)) {
1907
32
                   croak("write_table: Data values must be either all HASHes or all ARRAYs\n");
1908          }
1909
10
          int first_type = SvTYPE(SvRV(first_val));
1910
40
          if (first_type != SVt_PVHV && first_type != SVt_PVAV) {
1911
12
                   croak("write_table: Data values must be either all HASHes or all ARRAYs\n");
1912
12
          }
1913          is_hoh = (first_type == SVt_PVHV);
1914
15
          is_hoa = (first_type == SVt_PVAV);
1915
15
          hv_iterinit(hv);
1916          while ((entry = hv_iternext(hv))) {
1917
21
                   SV *restrict val = hv_iterval(hv, entry);
1918
18
                   if (!val || !SvROK(val) || SvTYPE(SvRV(val)) != first_type) {
1919
54
                        croak("write_table: Mixed data types detected. Ensure all values are %s references.\n", is_hoh ? "HASH" : "ARRAY");
1920
54
                   }
1921
48
          }
1922          if (is_hoh) {
1923
15
                   rows_av = newAV();
1924
15
                   hv_iterinit(hv);
1925                   while ((entry = hv_iternext(hv))) {
1926
24
                        av_push(rows_av, newSVsv(hv_iterkeysv(entry)));
1927
21
                   }
1928
45
          }
1929
27
        } else {
1930          AV *restrict av = (AV*)data_ref;
1931
24
          if (av_len(av) < 0) XSRETURN_EMPTY;
1932          SV **restrict first_ptr = av_fetch(av, 0, 0);
1933
15
          if (!first_ptr || !*first_ptr || !SvROK(*first_ptr) || SvTYPE(SvRV(*first_ptr)) != SVt_PVHV) {
1934
15
                   croak("write_table: For ARRAY data, all elements must be HASH references (Array of Hashes)\n");
1935          }
1936
1937
28
          for (size_t i = 0; i <= av_len(av); i++) {
1938
28
                   SV **restrict ptr = av_fetch(av, i, 0);
1939                   if (!ptr || !*ptr || !SvROK(*ptr) || SvTYPE(SvRV(*ptr)) != SVt_PVHV) {
1940
28
                        croak("write_table: Mixed data types detected in Array of Hashes. All elements must be HASH references.\n");
1941
40
                   }
1942          }
1943
111
          is_aoh = 1;
1944
90
        }
1945
1946
82
        PerlIO *restrict fh = PerlIO_open(file, "w");
1947
82
        if (!fh) croak("write_table: Could not open '%s' for writing", file);
1948
1949
15
        AV *restrict headers_av = newAV();
1950
11
        bool inc_rownames = (row_names_sv && SvTRUE(row_names_sv)) ? 1 : 0;
1951
7
        const char *restrict rownames_col = NULL;
1952
1953
7
        // ----- Hash of Hashes -----
1954
14
        if (is_hoh) {
1955                if (col_names_sv && SvOK(col_names_sv)) {
1956
50
                        AV *restrict c_av = (AV*)SvRV(col_names_sv);
1957                        for(size_t i=0; i<=av_len(c_av); i++) {
1958
41
                                SV **restrict c = av_fetch(c_av, i, 0);
1959                                if(c && SvOK(*c)) av_push(headers_av, newSVsv(*c));
1960                        }
1961
29
                } else {
1962                        HV *restrict col_map = newHV();
1963
17
                        hv_iterinit((HV*)data_ref);
1964                        HE *restrict entry;
1965
55
                        while((entry = hv_iternext((HV*)data_ref))) {
1966
39
                                 HV *restrict inner = (HV*)SvRV(hv_iterval((HV*)data_ref, entry));
1967
39
                                 hv_iterinit(inner);
1968
39
                                 HE *restrict inner_entry;
1969                                 while((inner_entry = hv_iternext(inner))) {
1970
78
                                     hv_store_ent(col_map, hv_iterkeysv(inner_entry), newSViv(1), 0);
1971
54
                                 }
1972
40
                        }
1973
40
                        unsigned num_cols = hv_iterinit(col_map);
1974                        const char **restrict col_array = safemalloc(num_cols * sizeof(char*));
1975                        for(unsigned i=0; i<num_cols; i++) {
1976
17
                                 HE *restrict ce = hv_iternext(col_map);
1977
4
                                 col_array[i] = SvPV_nolen(hv_iterkeysv(ce));
1978
13
                        }
1979
10
                        qsort(col_array, num_cols, sizeof(char*), cmp_string_wt);
1980
23
                        for(unsigned i=0; i<num_cols; i++) av_push(headers_av, newSVpv(col_array[i], 0));
1981                        safefree(col_array);
1982                        SvREFCNT_dec(col_map);
1983
21
        }
1984
17
        size_t num_headers = av_len(headers_av) + 1;
1985
43
        const char **restrict header_row = safemalloc((num_headers + 1) * sizeof(char*));
1986
1987
35
        size_t h_idx = 0;
1988        if (inc_rownames) header_row[h_idx++] = "";
1989
15
        for(unsigned short int i=0; i<num_headers; i++) {
1990
45
                  SV**restrict h_ptr = av_fetch(headers_av, i, 0);
1991
30
                  header_row[h_idx++] = (h_ptr && SvOK(*h_ptr)) ? SvPV_nolen(*h_ptr) : "";
1992        }
1993
28
        print_string_row(fh, header_row, h_idx, sep);
1994
28
        safefree(header_row);
1995
1996
7
        size_t num_rows = av_len(rows_av) + 1;
1997        const char **restrict row_array = safemalloc(num_rows * sizeof(char*));
1998
2
        for(size_t i=0; i<num_rows; i++) {
1999
8
                 row_array[i] = SvPV_nolen(*av_fetch(rows_av, i, 0));
2000
6
        }
2001
6
        qsort(row_array, num_rows, sizeof(char*), cmp_string_wt);
2002
2003
3
        HV *restrict data_hv = (HV*)data_ref;
2004        const char **restrict row_data = safemalloc((num_headers + 1) * sizeof(char*));
2005
2006
12
        for(size_t i=0; i<num_rows; i++) {
2007
9
                  size_t d_idx = 0;
2008                  if (inc_rownames) row_data[d_idx++] = row_array[i];
2009
2010
16
                  SV **restrict inner_hv_ptr = hv_fetch(data_hv, row_array[i], strlen(row_array[i]), 0);
2011
25
                  HV *restrict inner_hv = inner_hv_ptr ? (HV*)SvRV(*inner_hv_ptr) : NULL;
2012
2013
57
                  for(size_t j=0; j<num_headers; j++) {
2014
44
                      SV**restrict h_ptr = av_fetch(headers_av, j, 0);
2015
39
                      const char *restrict col_name = (h_ptr && SvOK(*h_ptr)) ? SvPV_nolen(*h_ptr) : "";
2016                      SV **restrict cell_ptr = inner_hv ? hv_fetch(inner_hv, col_name, strlen(col_name), 0) : NULL;
2017
13
                      if (cell_ptr && SvOK(*cell_ptr)) {
2018
13
                          if (SvROK(*cell_ptr)) {
2019
13
                              PerlIO_close(fh);
2020
78
                              safefree(row_array);
2021
65
                            safefree(row_data);
2022
65
                              if (headers_av) SvREFCNT_dec(headers_av);
2023
53
                              if (rows_av) SvREFCNT_dec(rows_av);
2024
0
                              croak("write_table: Cannot write nested reference types to table\n");
2025
0
                          }
2026
5
                          row_data[d_idx++] = SvPV_nolen(*cell_ptr);
2027
5
                      } else {
2028
5
                          row_data[d_idx++] = undef_val;
2029
5
                      }
2030
20
                  }
2031
15
                  print_string_row(fh, row_data, d_idx, sep);
2032
15
        }
2033
5
        safefree(row_array); safefree(row_data);
2034
2035
5
        } else if (is_hoa) { // ----- Hash of Arrays -----
2036          HV *restrict data_hv = (HV*)data_ref;
2037
5
          size_t max_rows = 0;
2038          hv_iterinit(data_hv);
2039          HE *restrict entry;
2040
32
          while((entry = hv_iternext(data_hv))) {
2041                   AV *restrict arr = (AV*)SvRV(hv_iterval(data_hv, entry));
2042                   size_t len = av_len(arr) + 1;
2043                   if (len > max_rows) max_rows = len;
2044
80
          }
2045
2046          if (col_names_sv && SvOK(col_names_sv)) {
2047                   AV *restrict c_av = (AV*)SvRV(col_names_sv);
2048
283
                   for(size_t i=0; i<=av_len(c_av); i++) {
2049
195
                        SV **restrict c = av_fetch(c_av, i, 0);
2050
195
                        if(c && SvOK(*c)) av_push(headers_av, newSVsv(*c));
2051
195
                   }
2052
390
          } else {
2053
195
                   unsigned int num_cols = hv_iterinit(data_hv);
2054
195
                   const char **restrict col_array = safemalloc(num_cols * sizeof(char*));
2055
195
                   for(unsigned int i=0; i<num_cols; i++) {
2056
104
                        HE *restrict ce = hv_iternext(data_hv);
2057
0
                        col_array[i] = SvPV_nolen(hv_iterkeysv(ce));
2058
0
                   }
2059
0
                   qsort(col_array, num_cols, sizeof(char*), cmp_string_wt);
2060
0
                   for(unsigned i=0; i<num_cols; i++) av_push(headers_av, newSVpv(col_array[i], 0));
2061                   safefree(col_array);
2062
104
          }
2063          if (av_len(headers_av) < 0) croak("Could not get headers in write_table");
2064
114
          if (inc_rownames && contains_nondigit(row_names_sv)) {
2065                   rownames_col = SvPV_nolen(row_names_sv);
2066                   AV *restrict filtered_headers = (AV*)sv_2mortal((SV*)newAV());
2067
2068                   for(size_t i=0; i<=av_len(headers_av); i++) {
2069                        SV**restrict h_ptr = av_fetch(headers_av, i, 0);
2070
173
                        if (!h_ptr || !*h_ptr) continue;
2071
146
                        SV *restrict h_sv = *h_ptr;
2072                        if (strcmp(SvPV_nolen(h_sv), rownames_col) != 0) {
2073
94
                            av_push(filtered_headers, newSVsv(h_sv));
2074
90
                        }
2075
171
                   }
2076
90
                   SvREFCNT_dec(headers_av);
2077
93
                   headers_av = filtered_headers;
2078
84
          }
2079
55
          size_t num_headers = av_len(headers_av) + 1;
2080
6
          const char **restrict header_row = safemalloc((num_headers + 1) * sizeof(char*));
2081
6
          size_t h_idx = 0;
2082          if (inc_rownames) header_row[h_idx++] = "";
2083          for(size_t i=0; i<num_headers; i++) {
2084
6
                   SV**restrict h_ptr = av_fetch(headers_av, i, 0);
2085
21
                   header_row[h_idx++] = (h_ptr && SvOK(*h_ptr)) ? SvPV_nolen(*h_ptr) : "";
2086
61
          }
2087
50
          print_string_row(fh, header_row, h_idx, sep);
2088
15
          safefree(header_row);
2089
42
          const char **restrict row_data = safemalloc((num_headers + 1) * sizeof(char*));
2090          for(size_t i=0; i<max_rows; i++) {
2091
69
                   size_t d_idx = 0;
2092
32
                   if (inc_rownames) {
2093                        if (rownames_col) {
2094                            SV **restrict rn_arr_ptr = hv_fetch(data_hv, rownames_col, strlen(rownames_col), 0);
2095                            if (rn_arr_ptr && SvROK(*rn_arr_ptr)) {
2096
9
                                AV *restrict rn_arr = (AV*)SvRV(*rn_arr_ptr);
2097
9
                                SV **restrict rn_val_ptr = av_fetch(rn_arr, i, 0);
2098
24
                                if (rn_val_ptr && SvOK(*rn_val_ptr)) {
2099
19
                                    if (SvROK(*rn_val_ptr)) {
2100
16
                                        PerlIO_close(fh);
2101                                        safefree(row_data);
2102
9
                                        if (headers_av) SvREFCNT_dec(headers_av);
2103
23
                                        croak("write_table: Cannot write nested reference types to table\n");
2104
8
                                    }
2105
8
                                    row_data[d_idx++] = SvPV_nolen(*rn_val_ptr);
2106                                } else {
2107
16
                                    row_data[d_idx++] = undef_val;
2108
5
                                }
2109
5
                            } else {
2110
5
                                row_data[d_idx++] = undef_val;
2111
5
                            }
2112
14
                        } else {
2113
9
                            char buf[32];
2114
2
                            snprintf(buf, sizeof(buf), "%ld", (long)(i + 1));
2115
2
                            row_data[d_idx++] = savepv(buf);
2116                        }
2117                   }
2118
7
                   for(size_t j=0; j<num_headers; j++) {
2119
5
                        SV**restrict h_ptr = av_fetch(headers_av, j, 0);
2120                        const char *restrict col_name = (h_ptr && SvOK(*h_ptr)) ? SvPV_nolen(*h_ptr) : "";
2121
14
                        SV **restrict arr_ptr = hv_fetch(data_hv, col_name, strlen(col_name), 0);
2122
11
                        if (arr_ptr && SvROK(*arr_ptr)) {
2123
16
                            AV *restrict arr = (AV*)SvRV(*arr_ptr);
2124
11
                            SV **restrict cell_ptr = av_fetch(arr, i, 0);
2125
32
                            if (cell_ptr && SvOK(*cell_ptr)) {
2126
24
                                if (SvROK(*cell_ptr)) {
2127
21
                                    PerlIO_close(fh);
2128                                    safefree(row_data);
2129
9
                                    if (headers_av) SvREFCNT_dec(headers_av);
2130
9
                                    croak("write_table: Cannot write nested reference types to table\n");
2131
9
                                }
2132
30
                                row_data[d_idx++] = SvPV_nolen(*cell_ptr);
2133
21
                            } else {
2134
21
                                row_data[d_idx++] = undef_val;
2135
21
                            }
2136
21
                        } else {
2137
9
                            row_data[d_idx++] = undef_val;
2138
3
                        }
2139
3
                   }
2140
3
                   print_string_row(fh, row_data, d_idx, sep);
2141
3
                   if (inc_rownames && !rownames_col) safefree((char*)row_data[0]);
2142
10
          }
2143
7
          safefree(row_data);
2144
7
        } else if (is_aoh) {// ----- Array of Hashes -----
2145          AV *restrict data_av = (AV*)data_ref;
2146
3
          size_t num_rows = av_len(data_av) + 1;
2147          if (col_names_sv && SvOK(col_names_sv)) {
2148
3
                   AV *restrict c_av = (AV*)SvRV(col_names_sv);
2149                   for(size_t i=0; i<=av_len(c_av); i++) {
2150                        SV **restrict c = av_fetch(c_av, i, 0);
2151                        if(c && SvOK(*c)) av_push(headers_av, newSVsv(*c));
2152
12
                   }
2153
19
          } else {
2154                   HV *restrict col_map = newHV();
2155                   for(size_t i=0; i<num_rows; i++) {
2156                        SV **restrict row_ptr = av_fetch(data_av, i, 0);
2157
79
                        if (row_ptr && SvROK(*row_ptr)) {
2158
58
                            HV *restrict row_hv = (HV*)SvRV(*row_ptr);
2159
58
                            hv_iterinit(row_hv);
2160
58
                            HE *restrict entry;
2161
54
                            while((entry = hv_iternext(row_hv))) {
2162
39
                                hv_store_ent(col_map, hv_iterkeysv(entry), newSViv(1), 0);
2163
0
                            }
2164
0
                        }
2165
0
                   }
2166
0
                   unsigned num_cols = hv_iterinit(col_map);
2167                   const char **restrict col_array = safemalloc(num_cols * sizeof(char*));
2168
39
                   for(unsigned int i=0; i<num_cols; i++) {
2169                        HE *restrict ce = hv_iternext(col_map);
2170
12
                        col_array[i] = SvPV_nolen(hv_iterkeysv(ce));
2171                   }
2172                   qsort(col_array, num_cols, sizeof(char*), cmp_string_wt);
2173
21
                   for(unsigned int i=0; i<num_cols; i++) av_push(headers_av, newSVpv(col_array[i], 0));
2174
21
                   safefree(col_array);
2175                   SvREFCNT_dec(col_map);
2176
12
          }
2177          if (inc_rownames && contains_nondigit(row_names_sv)) {
2178
34
                   rownames_col = SvPV_nolen(row_names_sv);
2179
55
                   AV *restrict filtered_headers = newAV();
2180
48
                   for(size_t i=0; i<=av_len(headers_av); i++) {
2181
48
                        SV**restrict h_ptr = av_fetch(headers_av, i, 0);
2182                        if (!h_ptr || !*h_ptr) continue;
2183                        SV *restrict h_sv = *h_ptr;
2184                        if (strcmp(SvPV_nolen(h_sv), rownames_col) != 0) {
2185                            av_push(filtered_headers, newSVsv(h_sv));
2186                        }
2187                   }
2188
56
                   SvREFCNT_dec(headers_av);
2189
56
                   headers_av = filtered_headers;
2190
52
          }
2191
39
          size_t num_headers = av_len(headers_av) + 1;
2192          const char **restrict header_row = safemalloc((num_headers + 1) * sizeof(char*));
2193          size_t h_idx = 0;
2194
39
          if (inc_rownames) header_row[h_idx++] = "";
2195          for(size_t i=0; i<num_headers; i++) {
2196
39
                   SV**restrict h_ptr = av_fetch(headers_av, i, 0);
2197
39
                   header_row[h_idx++] = (h_ptr && SvOK(*h_ptr)) ? SvPV_nolen(*h_ptr) : "";
2198          }
2199
13
          print_string_row(fh, header_row, h_idx, sep);
2200          safefree(header_row);
2201
43
          const char **restrict row_data = safemalloc((num_headers + 1) * sizeof(char*));
2202
46
          for(size_t i=0; i<num_rows; i++) {
2203                   size_t d_idx = 0;
2204
46
                   SV **restrict row_ptr = av_fetch(data_av, i, 0);
2205
42
                   HV *restrict row_hv = (row_ptr && SvROK(*row_ptr)) ? (HV*)SvRV(*row_ptr) : NULL;
2206
11
                   if (inc_rownames) {
2207                       if (rownames_col) {
2208
50
                         SV **restrict rn_val_ptr = row_hv ? hv_fetch(row_hv, rownames_col, strlen(rownames_col), 0) : NULL;
2209                         if (rn_val_ptr && SvOK(*rn_val_ptr)) {
2210
11936
                               if (SvROK(*rn_val_ptr)) {
2211
11897
                                    PerlIO_close(fh);
2212
11899
                                         safefree(row_data);
2213                                         if (headers_av) SvREFCNT_dec(headers_av);
2214
11899
                                    croak("write_table: Cannot write nested reference types to table\n");
2215
11893
                               }
2216
11893
                               row_data[d_idx++] = SvPV_nolen(*rn_val_ptr);
2217
11101
                         } else {
2218                               row_data[d_idx++] = undef_val;
2219                         }
2220
11899
                       } else {
2221                         char buf[32];
2222
11896
                         snprintf(buf, sizeof(buf), "%ld", (long)(i + 1));
2223
11886
                         row_data[d_idx++] = savepv(buf);
2224
11899
                       }
2225                   }
2226
2227                   for(size_t j=0; j<num_headers; j++) {
2228                        SV**restrict h_ptr = av_fetch(headers_av, j, 0);
2229
11896
                        const char *restrict col_name = (h_ptr && SvOK(*h_ptr)) ? SvPV_nolen(*h_ptr) : "";
2230
13
                        SV **restrict cell_ptr = row_hv ? hv_fetch(row_hv, col_name, strlen(col_name), 0) : NULL;
2231                        if (cell_ptr && SvOK(*cell_ptr)) {
2232                            if (SvROK(*cell_ptr)) {
2233                                PerlIO_close(fh);
2234
880764
                                safefree(row_data);
2235
868891
                                if (headers_av) SvREFCNT_dec(headers_av);
2236
872853
                                croak("write_table: Cannot write nested reference types to table\n");
2237
872837
                            }
2238
70937
                            row_data[d_idx++] = SvPV_nolen(*cell_ptr);
2239
3971
                        } else {
2240
3969
                            row_data[d_idx++] = undef_val;
2241
70926
                        }
2242
37176
                   }
2243
37442
                   print_string_row(fh, row_data, d_idx, sep);
2244
37447
                   if (inc_rownames && !rownames_col) safefree((char*)row_data[0]);
2245
37445
          }
2246          safefree(row_data);
2247
805862
        }
2248
158806
        if (headers_av) SvREFCNT_dec(headers_av);
2249
158806
        if (rows_av) SvREFCNT_dec(rows_av);
2250
154845
        PerlIO_close(fh);
2251
448433
        XSRETURN_EMPTY;
2252}
2253
2254SV*
2255_parse_csv_file(char* file, const char* sep_str, const char* comment_str, SV* callback = &PL_sv_undef)
2256
301512
INIT:
2257        PerlIO *restrict fp;
2258
289631
        AV *restrict data = NULL;
2259        AV *restrict current_row = newAV();
2260
34205
        SV *restrict field = newSVpvs("");
2261        bool in_quotes = 0, post_quote = 0;
2262
11883
        size_t sep_len, comment_len;
2263
11883
        SV *restrict line_sv;
2264        bool use_cb = 0;
2265
34202
CODE:
2266
23040
        if (SvOK(callback) && SvROK(callback) && SvTYPE(SvRV(callback)) == SVt_PVCV) {
2267
23040
                use_cb = 1;
2268
23042
        } else {
2269
23041
                data = newAV();
2270
279180
        }
2271
63495
        sep_len = sep_str ? strlen(sep_str) : 0;
2272
63495
        comment_len = comment_str ? strlen(comment_str) : 0;
2273
2274
63495
        fp = PerlIO_open(file, "r");
2275
227565
        if (!fp) {
2276                croak("Could not open file '%s'", file);
2277
3962
        }
2278        line_sv = newSV_type(SVt_PV);
2279
11882
        // Read line by line using PerlIO
2280        while (sv_gets(line_sv, fp, 0) != NULL) {
2281                char *restrict line = SvPV_nolen(line_sv);
2282
3999
                size_t len = SvCUR(line_sv);
2283
3999
                // chomp \r\n (Handles Windows invisible \r natively)
2284                if (len > 0 && line[len-1] == '\n') {
2285
3999
                        len--;
2286
3963
                        if (len > 0 && line[len-1] == '\r') {
2287
3963
                                len--;
2288
3963
                        }
2289
3963
                }
2290
3963
                if (!in_quotes) {
2291
3963
                        // Skip completely empty lines (\h*[\r\n]+$ equivalent)
2292
3963
                        bool is_empty = 1;
2293
3963
                        for (size_t i = 0; i < len; i++) {
2294
3963
                                if (line[i] != ' ' && line[i] != '\t') { is_empty = 0; break; }
2295
3963
                        }
2296
3963
                        if (is_empty) continue;
2297
2298                        // Skip comments
2299
3960
                        if (comment_len > 0 && len >= comment_len && strncmp(line, comment_str, comment_len) == 0) {
2300                                continue;
2301
16
                        }
2302                }
2303
52
                // --- CORE PARSING MACHINE ---
2304
52
                for (size_t i = 0; i < len; i++) {
2305                        const char ch = line[i];
2306
40
                        if (ch == '\r') continue;
2307
40
                        if (ch == '"') {
2308                                if (in_quotes && (i + 1 < len) && line[i+1] == '"') {
2309
1
                                        sv_catpvn(field, "\"", 1);
2310                                        i++; // Skip the escaped second quote
2311                                } else if (in_quotes) {
2312                                        in_quotes = 0;  // Close quotes
2313                                        post_quote = 1;
2314                                } else if (!post_quote) {
2315                                        in_quotes = 1; // Open quotes (only when not in post-quote state)
2316                                }
2317                        } else if (!in_quotes && sep_len > 0 && (len - i) >= sep_len && strncmp(line + i, sep_str, sep_len) == 0) {
2318
13
                                av_push(current_row, newSVsv(field));
2319
1
                                sv_setpvs(field, ""); // Reset for next field
2320                                i += sep_len - 1;     // Advance past multi-char separators
2321
13
                                post_quote = 0;
2322
1
                        } else {
2323                                sv_catpvn(field, &ch, 1);
2324                        }
2325                }
2326
13
                if (in_quotes) {
2327
7
                        // Line ended but quotes are still open! Append newline and fetch next
2328
4
                        sv_catpvn(field, "\n", 1);
2329
1
                } else {
2330                        post_quote = 0; // Reset post-quote state at row boundary
2331                        // Push the final field of the record
2332
13
                        av_push(current_row, newSVsv(field));
2333
12
                        sv_setpvs(field, "");
2334
13
                        // If a callback is provided, invoke it in a streaming fashion
2335
25
                        if (use_cb) {
2336                                dSP;
2337
25
                                ENTER;
2338
13
                                SAVETMPS;
2339                                PUSHMARK(SP);
2340                                XPUSHs(sv_2mortal(newRV_inc((SV*)current_row)));
2341                                PUTBACK;
2342                                call_sv(callback, G_DISCARD);
2343                                FREETMPS;
2344
25
                                LEAVE;
2345
12
                                SvREFCNT_dec(current_row); // Frees the row from C memory if Perl didn't keep it
2346
16
                        } else {
2347                                av_push(data, newRV_noinc((SV*)current_row));
2348
72
                        }
2349
64
                        current_row = newAV();
2350
60
                }
2351        }
2352        PerlIO_close(fp);
2353
64
        SvREFCNT_dec(line_sv);
2354
2355        if (in_quotes) {
2356                av_push(current_row, newSVsv(field));
2357
61
                if (use_cb) {
2358
60
                        dSP;
2359
64
                        ENTER;
2360
64
                        SAVETMPS;
2361                        PUSHMARK(SP);
2362                        XPUSHs(sv_2mortal(newRV_inc((SV*)current_row)));
2363                        PUTBACK;
2364                        call_sv(callback, G_DISCARD);
2365
16
                        FREETMPS;
2366
4
                        LEAVE;
2367
4
                        SvREFCNT_dec(current_row);
2368
0
                } else {
2369                        av_push(data, newRV_noinc((SV*)current_row));
2370
16
                }
2371                current_row = newAV();
2372        }
2373
16
        SvREFCNT_dec(field);
2374        SvREFCNT_dec(current_row);
2375
22
        if (use_cb) {
2376
114
                RETVAL = &PL_sv_undef; // Memory was fully handled by callback stream
2377
95
        } else {
2378
95
                RETVAL = newRV_noinc((SV*)data);
2379
95
        }
2380OUTPUT:
2381        RETVAL
2382
2383
29
SV* cov(SV* x_sv, SV* y_sv, const char* method = "pearson")
2384        CODE:
2385
29
        {
2386                // 1. Validate inputs are Array References
2387
23
                if (!SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV) {
2388
23
                        croak("cov: first argument 'x' must be an ARRAY reference");
2389                }
2390                if (!SvROK(y_sv) || SvTYPE(SvRV(y_sv)) != SVt_PVAV) {
2391
23
                        croak("cov: second argument 'y' must be an ARRAY reference");
2392
7
                }
2393
2394
18
                // 2. Validate method argument
2395
15
                if (strcmp(method, "pearson") != 0 &&
2396
19
                        strcmp(method, "spearman") != 0 &&
2397
19
                        strcmp(method, "kendall") != 0) {
2398
21
                        croak("cov: unknown method '%s' (use 'pearson', 'spearman', or 'kendall')", method);
2399
45
                }
2400
2401                AV *restrict x_av = (AV*)SvRV(x_sv);
2402
28
                AV *restrict y_av = (AV*)SvRV(y_sv);
2403
28
                size_t nx = av_len(x_av) + 1;
2404                size_t ny = av_len(y_av) + 1;
2405
2406
61
                if (nx != ny) {
2407
33
                        croak("cov: incompatible dimensions (x has %lu, y has %lu)",
2408
33
                                   (unsigned long)nx, (unsigned long)ny);
2409
31
                }
2410
2411
31
                // 3. Extract Valid Pairwise Data
2412                // Allocate temporary C arrays for numeric processing
2413                double *restrict x_val = (double*)safemalloc(nx * sizeof(double));
2414                double *restrict y_val = (double*)safemalloc(nx * sizeof(double));
2415                size_t n = 0;
2416
2417                for (size_t i = 0; i < nx; i++) {
2418                        SV **restrict x_tv = av_fetch(x_av, i, 0);
2419
18
                        SV **restrict y_tv = av_fetch(y_av, i, 0);
2420
2421
17
                        // Extract numeric values, defaulting to NAN for missing/invalid data
2422                        double xv = (x_tv && SvOK(*x_tv) && looks_like_number(*x_tv)) ? SvNV(*x_tv) : NAN;
2423                        double yv = (y_tv && SvOK(*y_tv) && looks_like_number(*y_tv)) ? SvNV(*y_tv) : NAN;
2424
2425                        // Pairwise complete observations (skips NAs seamlessly like R)
2426                        if (!isnan(xv) && !isnan(yv)) {
2427                                 x_val[n] = xv;
2428                                 y_val[n] = yv;
2429                                 n++;
2430
26
                        }
2431
26
                }
2432
2433                // 4. Handle edge cases where data is too sparse
2434                if (n < 2) {
2435                        Safefree(x_val);        Safefree(y_val);
2436                        RETVAL = newSVnv(NAN);
2437
22
                } else {
2438
22
                        double ans = 0.0;                       
2439
33
                        // 5. Algorithm routing
2440
31
                        if (strcmp(method, "kendall") == 0) {
2441
31
                                 // R's default cov(..., method="kendall") iterates the full n x n space
2442
31
                                 for (size_t i = 0; i < n; i++) {
2443
31
                                     for (size_t j = 0; j < n; j++) {
2444
31
                                         int sx = (x_val[i] > x_val[j]) - (x_val[i] < x_val[j]);
2445
24
                                         int sy = (y_val[i] > y_val[j]) - (y_val[i] < y_val[j]);
2446                                         ans += (double)(sx * sy);
2447
25
                                     }
2448
25
                                 }
2449
25
                        } else {
2450
28
                                 double mean_x = 0.0, mean_y = 0.0, cov_sum = 0.0;
2451
28
                                 if (strcmp(method, "spearman") == 0) {
2452                                     // Spearman: Rank the data first, then run standard covariance
2453
28
                                     double *restrict rx = (double*)safemalloc(n * sizeof(double));
2454
28
                                     double *restrict ry = (double*)safemalloc(n * sizeof(double));
2455
28
                                     // Uses your existing rank_data() helper from LikeR.xs
2456
28
                                     rank_data(x_val, rx, n);
2457                                     rank_data(y_val, ry, n);
2458                                     for (size_t i = 0; i < n; i++) {
2459                                         double dx = rx[i] - mean_x;
2460                                         mean_x += dx / (i + 1);
2461                                         double dy = ry[i] - mean_y;
2462
28
                                         mean_y += dy / (i + 1);
2463                                         cov_sum += dx * (ry[i] - mean_y);
2464
85
                                     }
2465
64
                                     Safefree(rx); Safefree(ry);
2466
64
                                 } else {
2467
64
                                     // Pearson: Welford's Single-Pass Covariance Algorithm
2468
43
                                     for (size_t i = 0; i < n; i++) {
2469
22
                                         double dx = x_val[i] - mean_x;
2470
7
                                         mean_x += dx / (i + 1);
2471                                         double dy = y_val[i] - mean_y;
2472
28
                                         mean_y += dy / (i + 1);
2473
28
                                         cov_sum += dx * (y_val[i] - mean_y);
2474                                     }
2475
28
                                 }
2476
2477
28
                                 // Unbiased Sample Covariance (N - 1) for Pearson & Spearman
2478                                 ans = cov_sum / (n - 1);
2479                        }
2480
28
                        Safefree(x_val); Safefree(y_val);
2481
28
                        RETVAL = newSVnv(ans);
2482
28
                }
2483        }
2484
47
        OUTPUT:
2485
313
          RETVAL
2486
2487SV* glm(...)
2488
40
CODE:
2489
33
{
2490
26
        const char *restrict formula  = NULL;
2491
21
        SV *restrict data_sv = NULL;
2492        const char *restrict family_str = "gaussian";
2493
28
        char f_cpy[512];
2494
28
        char *restrict src, *restrict dst, *restrict tilde, *restrict lhs, *restrict rhs, *restrict chunk;
2495
2496
28
        // Dynamic Term Arrays
2497
67
        char **restrict terms = NULL, **restrict uniq_terms = NULL, **restrict exp_terms = NULL;
2498
46
        bool *restrict is_dummy = NULL;
2499
7
        char **restrict dummy_base = NULL, **restrict dummy_level = NULL;
2500
7
        unsigned int term_cap = 64, exp_cap = 64, num_terms = 0, num_uniq = 0, p = 0, p_exp = 0;
2501        size_t n = 0, valid_n = 0, i;
2502
46
        bool has_intercept = TRUE, converged = FALSE, boundary = FALSE;
2503
7
        unsigned int iter = 0, max_iter = 25, final_rank = 0, df_res = 0;
2504
98
        double deviance_old = 0.0, deviance_new = 0.0, null_dev = 0.0, aic = 0.0;
2505        double dispersion = 0.0, epsilon = 1e-8;
2506
2507
46
        char **restrict row_names = NULL;
2508
7
        char **restrict valid_row_names = NULL;
2509
7
        HV **restrict row_hashes = NULL;
2510
7
        HV *restrict data_hoa = NULL;
2511
7
        SV *restrict ref = NULL;
2512
2513
7
        double *restrict X = NULL, *restrict Y = NULL, *restrict mu = NULL, *restrict eta = NULL;
2514
7
        double *restrict W = NULL, *restrict Z = NULL, *restrict beta = NULL, *restrict beta_old = NULL;
2515
20
        bool *restrict aliased = NULL;
2516
13
        double *restrict XtWX = NULL, *restrict XtWZ = NULL;
2517
2518        HV *restrict res_hv, *restrict coef_hv, *restrict fitted_hv, *restrict resid_hv, *restrict summary_hv;
2519
39
        AV *restrict terms_av;
2520
52
        HE *restrict entry;
2521
2522        if (items % 2 != 0) croak("Usage: glm(formula => 'am ~ wt + hp', data => \\%mtcars)");
2523
2524        for (unsigned short i_arg = 0; i_arg < items; i_arg += 2) {
2525          const char *restrict key = SvPV_nolen(ST(i_arg));
2526
94
          SV *restrict val = ST(i_arg + 1);
2527
73
          if      (strEQ(key, "formula")) formula = SvPV_nolen(val);
2528
117
          else if (strEQ(key, "data"))    data_sv = val;
2529
57
          else if (strEQ(key, "family"))  family_str = SvPV_nolen(val);
2530          else croak("glm: unknown argument '%s'", key);
2531
60
        }        
2532        if (!formula) croak("glm: formula is required");
2533
21
        if (!data_sv || !SvROK(data_sv)) croak("glm: data is required and must be a reference");
2534
2535        bool is_binomial = (strcmp(family_str, "binomial") == 0);
2536
21
        bool is_gaussian = (strcmp(family_str, "gaussian") == 0);
2537
21
        if (!is_binomial && !is_gaussian) croak("glm: unsupported family '%s'", family_str);
2538
2539
21
        // --- Formula Parsing & Expansion ---
2540
21
        Newx(terms, term_cap, char*); Newx(uniq_terms, term_cap, char*);
2541
34
        Newx(exp_terms, exp_cap, char*); Newx(is_dummy, exp_cap, bool);
2542
34
        Newx(dummy_base, exp_cap, char*); Newx(dummy_level, exp_cap, char*);
2543
2544
19
        src = (char*)formula; dst = f_cpy;
2545
33
        while (*src && (dst - f_cpy < 511)) { if (!isspace(*src)) { *dst++ = *src; } src++; }
2546
26
        *dst = '\0';
2547
2548
379
        tilde = strchr(f_cpy, '~');
2549
380
        if (!tilde) croak("glm: invalid formula, missing '~'");
2550        *tilde = '\0';
2551
22
        lhs = f_cpy; rhs = tilde + 1;
2552
2553
22
        if (strstr(rhs, "-1")) has_intercept = FALSE;
2554
22
        if (has_intercept) terms[num_terms++] = savepv("Intercept");
2555
2556        chunk = strtok(rhs, "+");
2557
487
        while (chunk != NULL) {
2558
487
          if (num_terms >= term_cap - 3) {
2559
487
                   term_cap *= 2;
2560                   Renew(terms, term_cap, char*); Renew(uniq_terms, term_cap, char*);
2561
7
          }
2562          if (strcmp(chunk, "1") == 0 || strcmp(chunk, "-1") == 0) {
2563
2
                   chunk = strtok(NULL, "+");
2564
2
                   continue;
2565
2
          }
2566
122
          char *restrict star = strchr(chunk, '*');
2567
120
          if (star) {
2568
120
                   *star = '\0';
2569
5
                   char *restrict left = chunk; char *restrict right = star + 1;
2570
5
                   char *restrict c_l = strchr(left, '^'); if (c_l && strncmp(left, "I(", 2) != 0) *c_l = '\0';
2571
5
                   char *restrict c_r = strchr(right, '^'); if (c_r && strncmp(right, "I(", 2) != 0) *c_r = '\0';
2572
2573                   terms[num_terms++] = savepv(left);
2574
165
                   terms[num_terms++] = savepv(right);
2575
160
                   size_t inter_len = strlen(left) + strlen(right) + 2;
2576
160
                   terms[num_terms] = (char*)safemalloc(inter_len);
2577                   snprintf(terms[num_terms++], inter_len, "%s:%s", left, right);
2578          } else {
2579
160
                   char *restrict c_chunk = strchr(chunk, '^');
2580                   if (c_chunk && strncmp(chunk, "I(", 2) != 0) *c_chunk = '\0';
2581                   terms[num_terms++] = savepv(chunk);
2582
81
          }
2583
60
          chunk = strtok(NULL, "+");
2584
0
        }
2585
2586
0
        for (i = 0; i < num_terms; i++) {
2587          bool found = FALSE;
2588
60
          for (size_t j = 0; j < num_uniq; j++) {
2589
21
                   if (strcmp(terms[i], uniq_terms[j]) == 0) { found = TRUE; break; }
2590          }
2591
39
          if (!found) uniq_terms[num_uniq++] = savepv(terms[i]);
2592
3
        }
2593
3
        p = num_uniq;
2594
2595
180
        // --- Data Extraction ---
2596
180
        ref = SvRV(data_sv);
2597
180
        if (SvTYPE(ref) == SVt_PVHV) {
2598
270
                HV*restrict hv = (HV*)ref;
2599
291
                if (hv_iterinit(hv) == 0) croak("glm: Data hash is empty");
2600                entry = hv_iternext(hv);
2601
200
                if (entry) {
2602
6
                        SV*restrict val = hv_iterval(hv, entry);
2603
6
                        if (SvROK(val) && SvTYPE(SvRV(val)) == SVt_PVAV) {
2604                                 data_hoa = hv;
2605
180
                                 n = av_len((AV*)SvRV(val)) + 1;
2606                                 Newx(row_names, n, char*);
2607                                 for(i = 0; i < n; i++) {
2608
23
                                     char buf[32]; snprintf(buf, sizeof(buf), "%lu", i+1);
2609
13
                                     row_names[i] = savepv(buf);
2610
19
                                 }
2611
4
                        } else if (SvROK(val) && SvTYPE(SvRV(val)) == SVt_PVHV) {
2612
4
                                 n = hv_iterinit(hv);
2613                                 Newx(row_names, n, char*); Newx(row_hashes, n, HV*);
2614                                 i = 0;
2615                                 while ((entry = hv_iternext(hv))) {
2616
67
                                     I32 len;
2617
63
                                     row_names[i] = savepv(hv_iterkey(entry, &len));
2618
60
                                     row_hashes[i] = (HV*)SvRV(hv_iterval(hv, entry));
2619
60
                                     i++;
2620
90
                                 }
2621                        } else croak("glm: Hash values must be ArrayRefs (HoA) or HashRefs (HoH)");
2622
91
                }
2623
63
        } else if (SvTYPE(ref) == SVt_PVAV) {
2624
5
          AV*restrict av = (AV*)ref;
2625
5
          n = av_len(av) + 1;
2626
63
          Newx(row_names, n, char*); Newx(row_hashes, n, HV*);
2627          for (i = 0; i < n; i++) {
2628
10
                   SV**restrict val = av_fetch(av, i, 0);
2629
5
                   if (val && SvROK(*val) && SvTYPE(SvRV(*val)) == SVt_PVHV) {
2630                       row_hashes[i] = (HV*)SvRV(*val);
2631
2
                       char buf[32]; snprintf(buf, sizeof(buf), "%lu", i + 1);
2632                       row_names[i] = savepv(buf);
2633                   } else {
2634
37
                       for (size_t k = 0; k < i; k++) Safefree(row_names[k]);
2635                       Safefree(row_names); Safefree(row_hashes);
2636                       croak("glm: Array values must be HashRefs (AoH)");
2637
22
                   }
2638          }
2639
23
        } else croak("glm: Data must be an Array or Hash reference");
2640
2641        // --- Categorical Expansion ---
2642        for (size_t j = 0; j < p; j++) {
2643
861
          if (p_exp + 32 >= exp_cap) {
2644
840
                   exp_cap *= 2;
2645
840
                   Renew(exp_terms, exp_cap, char*); Renew(is_dummy, exp_cap, bool);
2646                   Renew(dummy_base, exp_cap, char*); Renew(dummy_level, exp_cap, char*);
2647
841
          }
2648
841
          if (strcmp(uniq_terms[j], "Intercept") == 0) {
2649
3181
                   exp_terms[p_exp] = savepv("Intercept"); is_dummy[p_exp] = FALSE; p_exp++; continue;
2650
2341
          }
2651
841
          if (is_column_categorical(data_hoa, row_hashes, n, uniq_terms[j])) {
2652
1503
                   char **restrict levels = NULL; size_t num_levels = 0, levels_cap = 8;
2653
181
                   Newx(levels, levels_cap, char*);
2654
180
                   for (i = 0; i < n; i++) {
2655
192
                       char*restrict str_val = get_data_string_alloc(data_hoa, row_hashes, i, uniq_terms[j]);
2656
187
                       if (str_val) {
2657
7
                           bool found = FALSE;
2658                           for (size_t l = 0; l < num_levels; l++) {
2659
1327
                               if (strcmp(levels[l], str_val) == 0) { found = TRUE; break; }
2660
1607
                           }
2661                           if (!found) {
2662                               if (num_levels >= levels_cap) { levels_cap *= 2; Renew(levels, levels_cap, char*); }
2663
1120
                               levels[num_levels++] = savepv(str_val);
2664
1120
                           }
2665
3460
                           Safefree(str_val);
2666
1120
                       }
2667
1900
                   }
2668
1620
                   if (num_levels > 0) {
2669                       for (size_t l1 = 0; l1 < num_levels - 1; l1++) {
2670
301
                           for (size_t l2 = l1 + 1; l2 < num_levels; l2++) {
2671
521
                               if (strcmp(levels[l1], levels[l2]) > 0) {
2672
60
                                   char *tmp = levels[l1]; levels[l1] = levels[l2]; levels[l2] = tmp;
2673
60
                               }
2674                           }
2675                       }
2676
81
                       for (size_t l = 1; l < num_levels; l++) {
2677
81
                           if (p_exp >= exp_cap) {
2678
21
                               exp_cap *= 2;
2679
461
                               Renew(exp_terms, exp_cap, char*); Renew(is_dummy, exp_cap, bool);
2680
461
                               Renew(dummy_base, exp_cap, char*); Renew(dummy_level, exp_cap, char*);
2681
361
                           }
2682                           size_t t_len = strlen(uniq_terms[j]) + strlen(levels[l]) + 1;
2683
301
                           exp_terms[p_exp] = (char*)safemalloc(t_len);
2684
1921
                           snprintf(exp_terms[p_exp], t_len, "%s%s", uniq_terms[j], levels[l]);
2685
301
                           is_dummy[p_exp] = TRUE; dummy_base[p_exp] = savepv(uniq_terms[j]); dummy_level[p_exp] = savepv(levels[l]);
2686
1141
                           p_exp++;
2687
1120
                       }
2688
103
                       for (size_t l = 0; l < num_levels; l++) Safefree(levels[l]);
2689
103
                       Safefree(levels);
2690
96
                   } else {
2691
96
                       Safefree(levels); exp_terms[p_exp] = savepv(uniq_terms[j]); is_dummy[p_exp] = FALSE; p_exp++;
2692
103
                   }
2693
46
          } else {
2694
7
                   exp_terms[p_exp] = savepv(uniq_terms[j]); is_dummy[p_exp] = FALSE; p_exp++;
2695
103
          }
2696        }
2697
751
        p = p_exp;
2698
2699        Newx(X, n * p, double); Newx(Y, n, double);
2700        Newx(valid_row_names, n, char*);
2701
2702
70
        // --- Listwise Deletion ---
2703
2702
        for (size_t i = 0; i < n; i++) {
2704
2359
                double y_val = evaluate_term(data_hoa, row_hashes, i, lhs);
2705
1151
                if (isnan(y_val)) { Safefree(row_names[i]); continue; }
2706
2707
896
                bool row_ok = TRUE;
2708
896
                double *restrict row_x = (double*)safemalloc(p * sizeof(double));
2709
896
                for (size_t j = 0; j < p; j++) {
2710                        if (strcmp(exp_terms[j], "Intercept") == 0) {
2711
1520
                                 row_x[j] = 1.0;
2712
1520
                        } else if (is_dummy[j]) {
2713                                 char* str_val = get_data_string_alloc(data_hoa, row_hashes, i, dummy_base[j]);
2714                                 if (str_val) {
2715                                     row_x[j] = (strcmp(str_val, dummy_level[j]) == 0) ? 1.0 : 0.0;
2716
796
                                     Safefree(str_val);
2717
2415
                                 } else { row_ok = FALSE; break; }
2718
2384
                        } else {
2719
9296
                                 row_x[j] = evaluate_term(data_hoa, row_hashes, i, exp_terms[j]);
2720
6944
                                 if (isnan(row_x[j])) { row_ok = FALSE; break; }
2721
6717
                        }
2722
26869
                }
2723                if (!row_ok) { Safefree(row_names[i]); Safefree(row_x); continue; }
2724                Y[valid_n] = y_val;
2725
847
                for (size_t j = 0; j < p; j++) X[valid_n * p + j] = row_x[j];
2726
534
                valid_row_names[valid_n] = row_names[i];
2727
471
                valid_n++;
2728
471
                Safefree(row_x);
2729
1008
        }
2730
471
        Safefree(row_names);
2731        if (valid_n <= p) {
2732          Safefree(X); Safefree(Y); Safefree(valid_row_names); if (row_hashes) Safefree(row_hashes);
2733          croak("glm: 0 degrees of freedom (too many NAs or parameters > observations)");
2734
559
        }
2735
1189
        // --- R glm.fit IRLS Implementation ---
2736
891
        mu = (double*)safemalloc(valid_n * sizeof(double)); eta = (double*)safemalloc(valid_n * sizeof(double));
2737
24955
        W = (double*)safemalloc(valid_n * sizeof(double)); Z = (double*)safemalloc(valid_n * sizeof(double));
2738
24304
        beta = (double*)safemalloc(p * sizeof(double)); beta_old = (double*)safemalloc(p * sizeof(double));
2739
93496
        aliased = (bool*)safemalloc(p * sizeof(bool));
2740
25752
        XtWX = (double*)safemalloc(p * p * sizeof(double)); XtWZ = (double*)safemalloc(p * sizeof(double));
2741
25752
        for (i = 0; i < p; i++) { beta[i] = 0.0; beta_old[i] = 0.0; }
2742
17328
        // Initialize (mustart / etastart equivalent)
2743        double sum_y = 0.0;
2744
8661
        for (i = 0; i < valid_n; i++) sum_y += Y[i];
2745
8722
        double mean_y = sum_y / valid_n;
2746        for (i = 0; i < valid_n; i++) {
2747
8701
          if (is_binomial) {
2748
8701
                   if (Y[i] < 0.0 || Y[i] > 1.0) croak("glm: binomial family requires response between 0 and 1");
2749
3750
                   mu[i] = (Y[i] + 0.5) / 2.0;
2750
61
                   eta[i] = log(mu[i] / (1.0 - mu[i]));
2751
8661
                   double dev = 0.0;
2752                   if (Y[i] == 0.0)      dev = -2.0 * log(1.0 - mu[i]);
2753
15111
                   else if (Y[i] == 1.0) dev = -2.0 * log(mu[i]);
2754
15090
                   else dev = 2.0 * (Y[i] * log(Y[i] / mu[i]) + (1.0 - Y[i]) * log((1.0 - Y[i]) / (1.0 - mu[i])));
2755
22930
                   deviance_old += dev;
2756          } else {
2757                   mu[i] = mean_y; // R gaussian init
2758                   eta[i] = mu[i];
2759
8470
          }
2760
30760
        }
2761        // IRLS Loop
2762        for (iter = 1; iter <= max_iter; iter++) {
2763
7870
                for (i = 0; i < valid_n; i++) {
2764
7960
                        if (is_binomial) {
2765                                 double varmu = mu[i] * (1.0 - mu[i]);
2766                                 double mu_eta = varmu; // Link derivative for logit
2767
2943
                                 if (varmu < 1e-10) varmu = 1e-10;
2768
2901
                                 Z[i] = eta[i] + (Y[i] - mu[i]) / mu_eta;
2769                                 W[i] = (mu_eta * mu_eta) / varmu;
2770
2922
                        } else {
2771
3045
                                 W[i] = 1.0;
2772                                 Z[i] = Y[i];
2773                        }
2774
3135
                }
2775
2031
                // Formulate XtWX and XtWZ
2776
840
                for (i = 0; i < p; i++) { XtWZ[i] = 0.0; for (size_t j = 0; j < p; j++) XtWX[i * p + j] = 0.0; }
2777
3720
                for (size_t k = 0; k < valid_n; k++) {
2778
8140
                        double w = W[k], z = Z[k];
2779
7300
                        for (i = 0; i < p; i++) {
2780
13960
                                 XtWZ[i] += X[k * p + i] * w * z;
2781                                 double xw = X[k * p + i] * w;
2782                                 for (size_t j = 0; j < p; j++) XtWX[i * p + j] += xw * X[k * p + j];
2783
231
                        }
2784                }
2785
221
                final_rank = sweep_matrix_ols(XtWX, p, aliased);
2786
871
                for (i = 0; i < p; i++) {
2787
880
                        if (aliased[i]) { beta[i] = NAN; } else {
2788
117
                                 double sum = 0.0;
2789
46
                                 for (size_t j = 0; j < p; j++) if (!aliased[j]) sum += XtWX[i * p + j] * XtWZ[j];
2790
14
                                 beta[i] = sum;
2791                        }
2792
799
                }
2793
829
                // Calculate updated ETA, MU, and Deviance (with Step-Halving)
2794                boundary = FALSE;
2795                for (unsigned short int half = 0; half < 10; half++) {
2796                        deviance_new = 0.0;
2797
308
                        for (i = 0; i < valid_n; i++) {
2798
298
                                 double linear_pred = 0.0;
2799
298
                                 for (size_t j = 0; j < p; j++) if (!aliased[j]) linear_pred += X[i * p + j] * beta[j];
2800
1063
                                 eta[i] = linear_pred;
2801
783
                                 if (is_binomial) {
2802                                     mu[i] = 1.0 / (1.0 + exp(-eta[i]));
2803                                     // Boundary enforcement
2804
3021
                                     if (mu[i] < 10 * DBL_EPSILON) mu[i] = 10 * DBL_EPSILON;
2805
28
                                     if (mu[i] > 1.0 - 10 * DBL_EPSILON) mu[i] = 1.0 - 10 * DBL_EPSILON;
2806
2807
1148
                                     double dev = 0.0;
2808
1120
                                     if (Y[i] == 0.0)      dev = -2.0 * log(1.0 - mu[i]);
2809
872
                                     else if (Y[i] == 1.0) dev = -2.0 * log(mu[i]);
2810                                     else dev = 2.0 * (Y[i] * log(Y[i] / mu[i]) + (1.0 - Y[i]) * log((1.0 - Y[i]) / (1.0 - mu[i])));
2811
109
                                     deviance_new += dev;
2812
96
                                 } else {
2813
287
                                     mu[i] = eta[i];
2814
248
                                     double res = Y[i] - mu[i];
2815
103
                                     deviance_new += res * res;
2816                                 }
2817
846
                        }
2818
846
                        // Step halving divergence check
2819
841
                        if (!is_binomial || deviance_new <= deviance_old + 1e-7 || !isfinite(deviance_new)) {
2820                                 continue;
2821
22
                        }
2822
2823
28
                        boundary = TRUE;
2824
88
                        for (size_t j = 0; j < p; j++) beta[j] = (beta[j] + beta_old[j]) / 2.0;
2825
67
                }
2826
347
                // Convergence Check
2827                if (fabs(deviance_new - deviance_old) / (0.1 + fabs(deviance_new)) < epsilon) {
2828
340
                        converged = TRUE; break;
2829
340
                }
2830
32
                deviance_old = deviance_new;
2831
32
                for (size_t j = 0; j < p; j++) beta_old[j] = beta[j];
2832
13
        }
2833
0
        // Final accurate calculation of W for standard errors
2834        for (i = 0; i < p; i++) { for (size_t j = 0; j < p; j++) XtWX[i * p + j] = 0.0; }
2835
92
        for (size_t k = 0; k < valid_n; k++) {
2836
340
          double w = is_binomial ? (mu[k] * (1.0 - mu[k])) : 1.0;
2837
340
          if (w < 1e-10) w = 1e-10;
2838          for (i = 0; i < p; i++) {
2839
340
                   double xw = X[k * p + i] * w;
2840
67
                   for (size_t j = 0; j < p; j++) XtWX[i * p + j] += xw * X[k * p + j];
2841
67
          }
2842
87
        }
2843        final_rank = sweep_matrix_ols(XtWX, p, aliased);
2844
80
        // --- Null Deviance Calculation ---
2845        double wtdmu = mean_y; // Since weights are 1.0 initially
2846        for (i = 0; i < valid_n; i++) {
2847
41
          if (is_binomial) {
2848
41
                   if (Y[i] == 0.0)      null_dev += -2.0 * log(1.0 - wtdmu);
2849
41
                   else if (Y[i] == 1.0) null_dev += -2.0 * log(wtdmu);
2850
21
                   else null_dev += 2.0 * (Y[i] * log(Y[i] / wtdmu) + (1.0 - Y[i]) * log((1.0 - Y[i]) / (1.0 - wtdmu)));
2851
21
          } else {
2852
21
                   double diff = Y[i] - wtdmu;
2853
21
                   null_dev += diff * diff;
2854
41
          }
2855
41
        }
2856
41
        // --- AIC Calculation ---
2857
41
        if (is_gaussian) {
2858
41
          double n_f = (double)valid_n;
2859
41
          aic = n_f * (log(2.0 * M_PI) + 1.0 + log(deviance_new / n_f)) + 2.0 * (final_rank + 1.0);
2860
41
        } else if (is_binomial) {
2861
41
          aic = deviance_new + 2.0 * final_rank;
2862        }
2863        // --- Return Structures ---
2864
88
        res_hv = newHV(); coef_hv = newHV(); fitted_hv = newHV(); resid_hv = newHV();
2865
28
        df_res = valid_n - final_rank;
2866
88
        dispersion = is_binomial ? 1.0 : ((df_res > 0) ? (deviance_new / df_res) : NAN);
2867
28
        for (size_t i = 0; i < valid_n; i++) {
2868
88
                double res = Y[i] - mu[i];
2869
67
                if (is_binomial) {
2870
67
                        // Deviance residuals for binomial
2871                        double d_res = 0.0;
2872
28
                        if (Y[i] == 0.0)      d_res = sqrt(-2.0 * log(1.0 - mu[i]));
2873                        else if (Y[i] == 1.0) d_res = sqrt(-2.0 * log(mu[i]));
2874
28
                        else d_res = sqrt(2.0 * (Y[i] * log(Y[i] / mu[i]) + (1.0 - Y[i]) * log((1.0 - Y[i]) / (1.0 - mu[i]))));
2875
28
                        res = (Y[i] > mu[i]) ? d_res : -d_res;
2876
28
                }
2877
28
                hv_store(fitted_hv, valid_row_names[i], strlen(valid_row_names[i]), newSVnv(mu[i]), 0);
2878                hv_store(resid_hv,  valid_row_names[i], strlen(valid_row_names[i]), newSVnv(res), 0);
2879
28
                Safefree(valid_row_names[i]);
2880        }
2881        Safefree(valid_row_names);
2882
2883        summary_hv = newHV(); terms_av = newAV();
2884        for (size_t j = 0; j < p; j++) {
2885                hv_store(coef_hv, exp_terms[j], strlen(exp_terms[j]), newSVnv(beta[j]), 0);
2886                av_push(terms_av, newSVpv(exp_terms[j], 0));
2887
2888
7
                HV *restrict row_hv = newHV();
2889                if (aliased[j]) {
2890
48
                        hv_store(row_hv, "Estimate",   8, newSVpv("NaN", 0), 0);
2891                        hv_store(row_hv, "Std. Error", 10, newSVpv("NaN", 0), 0);
2892
28
                        hv_store(row_hv, is_binomial ? "z value" : "t value", 7, newSVpv("NaN", 0), 0);
2893
48
                        hv_store(row_hv, is_binomial ? "Pr(>|z|)" : "Pr(>|t|)", 8, newSVpv("NaN", 0), 0);
2894
28
                } else {
2895
48
                        double se = sqrt(dispersion * XtWX[j * p + j]);
2896
41
                        double val_stat = beta[j] / se;
2897                        double p_val = is_binomial ? 2.0 * (1.0 - approx_pnorm(fabs(val_stat))) : get_t_pvalue(val_stat, df_res, "two.sided");
2898
2899
125
                        hv_store(row_hv, "Estimate",   8, newSVnv(beta[j]), 0);
2900
91
                        hv_store(row_hv, "Std. Error", 10, newSVnv(se), 0);
2901
91
                        hv_store(row_hv, is_binomial ? "z value" : "t value", 7, newSVnv(val_stat), 0);
2902                        hv_store(row_hv, is_binomial ? "Pr(>|z|)" : "Pr(>|t|)", 8, newSVnv(p_val), 0);
2903
91
                }
2904
70
                hv_store(summary_hv, exp_terms[j], strlen(exp_terms[j]), newRV_noinc((SV*)row_hv), 0);
2905
49
        }
2906
2907
28
        hv_store(res_hv, "aic",            3, newSVnv(aic), 0);
2908
0
        hv_store(res_hv, "coefficients",  12, newRV_noinc((SV*)coef_hv), 0);
2909        hv_store(res_hv, "converged",      9, newSVuv(converged ? 1 : 0), 0);
2910        hv_store(res_hv, "boundary",       8, newSVuv(boundary ? 1 : 0), 0);
2911        hv_store(res_hv, "deviance",       8, newSVnv(deviance_new), 0);
2912        hv_store(res_hv, "deviance.resid", 14, newRV_noinc((SV*)resid_hv), 0);
2913
28
        hv_store(res_hv, "df.null",        7, newSVuv(valid_n - has_intercept), 0);
2914        hv_store(res_hv, "df.residual",   11, newSVuv(df_res), 0);
2915
28
        hv_store(res_hv, "family",         6, newSVpv(family_str, 0), 0);
2916
28
        hv_store(res_hv, "fitted.values", 13, newRV_noinc((SV*)fitted_hv), 0);
2917
28
        hv_store(res_hv, "iter",           4, newSVuv(iter > max_iter ? max_iter : iter), 0);
2918        hv_store(res_hv, "null.deviance", 13, newSVnv(null_dev), 0);
2919        hv_store(res_hv, "rank",           4, newSVuv(final_rank), 0);
2920
28
        hv_store(res_hv, "summary",        7, newRV_noinc((SV*)summary_hv), 0);
2921
28
        hv_store(res_hv, "terms",          5, newRV_noinc((SV*)terms_av), 0);
2922
2923        // --- Cleanup ---
2924        for (i = 0; i < num_terms; i++) Safefree(terms[i]);
2925
49
        Safefree(terms);
2926
49
        for (i = 0; i < num_uniq; i++) Safefree(uniq_terms[i]);
2927        Safefree(uniq_terms);
2928
49
        for (size_t j = 0; j < p_exp; j++) {
2929
42
                Safefree(exp_terms[j]);
2930                if (is_dummy[j]) { Safefree(dummy_base[j]); Safefree(dummy_level[j]); }
2931
35
        }
2932
35
        Safefree(exp_terms); Safefree(is_dummy); Safefree(dummy_base); Safefree(dummy_level);
2933
2934
28
        Safefree(mu); Safefree(eta); Safefree(Z); Safefree(W);
2935
138
        Safefree(beta); Safefree(beta_old); Safefree(aliased);
2936
124
        Safefree(XtWX); Safefree(XtWZ); Safefree(X); Safefree(Y);
2937
124
        if (row_hashes) Safefree(row_hashes);
2938
2939
124
        RETVAL = newRV_noinc((SV*)res_hv);
2940
124
}
2941OUTPUT:
2942    RETVAL
2943
2944
112
SV* cor_test(...)
2945
105
CODE:
2946
112
{
2947        if (items < 2 || items % 2 != 0)
2948                croak("Usage: cor_test(\\@x, \\@y, method => 'pearson', ...)");
2949
2950
28
        SV *restrict x_ref = ST(0), *restrict y_ref = ST(1);
2951
2952
7
        const char *restrict alternative = "two.sided";
2953
7
        const char *restrict method = "pearson";
2954        SV *restrict exact_sv = NULL;
2955        double conf_level = 0.95;
2956
28
        bool continuity = 0;
2957
2958
19
        /* Parse named arguments from the flat stack starting at index 2 */
2959
118
        for (unsigned short int i = 2; i < items; i += 2) {
2960
99
          const char *restrict key = SvPV_nolen(ST(i));
2961
99
          SV *restrict val = ST(i + 1);
2962
2963
99
          if      (strEQ(key, "alternative")) alternative = SvPV_nolen(val);
2964
99
          else if (strEQ(key, "method"))      method = SvPV_nolen(val);
2965
95
          else if (strEQ(key, "exact"))       exact_sv = val;
2966
95
          else if (strEQ(key, "conf.level") || strEQ(key, "conf_level")) conf_level = SvNV(val);
2967          else if (strEQ(key, "continuity"))  continuity = SvTRUE(val);
2968
47
          else croak("cor_test: unknown argument '%s'", key);
2969
19
        }
2970
2971        AV *restrict x_av, *restrict y_av;
2972        double *restrict x, *restrict y;
2973
12
        double estimate = 0, p_value = 0, statistic = 0, df = 0, ci_lower = 0, ci_upper = 0;
2974
2975
19
        bool is_pearson  = (strcmp(method, "pearson") == 0);
2976
16
        bool is_kendall  = (strcmp(method, "kendall") == 0);
2977
36
        bool is_spearman = (strcmp(method, "spearman") == 0);
2978
32
        HV *restrict rhv;
2979
2980        if (!SvOK(x_ref) || !SvROK(x_ref) || SvTYPE(SvRV(x_ref)) != SVt_PVAV ||
2981
32
            !SvOK(y_ref) || !SvROK(y_ref) || SvTYPE(SvRV(y_ref)) != SVt_PVAV) {
2982
29
          croak("cor_test: x and y must be array references");
2983
26
        }
2984
2985
104
        x_av = (AV*)SvRV(x_ref);
2986
80
        y_av = (AV*)SvRV(y_ref);
2987
2988        size_t n_raw = av_len(x_av) + 1;
2989
64
        if (n_raw != av_len(y_av) + 1) croak("incompatible dimensions");
2990
2991
64
        x = safemalloc(n_raw * sizeof(double));
2992
64
        y = safemalloc(n_raw * sizeof(double));
2993
2994        size_t n = 0; /* Final count of pairwise complete observations */
2995        for (size_t i = 0; i < n_raw; i++) {
2996
10
          SV **restrict x_val = av_fetch(x_av, i, 0);
2997
10
          SV **restrict y_val = av_fetch(y_av, i, 0);
2998
2999
10
          double xv = (x_val && SvOK(*x_val) && looks_like_number(*x_val)) ? SvNV(*x_val) : NAN;
3000          double yv = (y_val && SvOK(*y_val) && looks_like_number(*y_val)) ? SvNV(*y_val) : NAN;
3001
3002          /* Pairwise complete observations (skips NAs seamlessly like R) */
3003
10
          if (!isnan(xv) && !isnan(yv)) {
3004
9
              x[n] = xv;
3005              y[n] = yv;
3006
2
              n++;
3007          }
3008        }
3009
3010        if (n < 3) {
3011
34
          Safefree(x);
3012
26
          Safefree(y);
3013
26
          croak("not enough finite observations");
3014
26
        }
3015
3016        if (is_pearson) {
3017
20
          // Welford's Method for Pearson Correlation
3018
20
          double mean_x = 0.0, mean_y = 0.0, M2_x = 0.0, M2_y = 0.0, cov = 0.0;
3019
20
          for (size_t i = 0; i < n; i++) {
3020
4
                   double dx = x[i] - mean_x;
3021                   mean_x += dx / (i + 1);
3022
2
                   double dy = y[i] - mean_y;
3023
2
                   mean_y += dy / (i + 1);
3024
2
                   M2_x += dx * (x[i] - mean_x);
3025
2
                   M2_y += dy * (y[i] - mean_y);
3026                   cov  += dx * (y[i] - mean_y);
3027
2
          }
3028          estimate = (M2_x > 0.0 && M2_y > 0.0) ? cov / sqrt(M2_x * M2_y) : 0.0;
3029          df = n - 2;
3030
3
          statistic = estimate * sqrt(df / (1.0 - estimate * estimate));
3031
3032
5
          // Confidence interval using Fisher's Z transform
3033
5
          double z = 0.5 * log((1.0 + estimate) / (1.0 - estimate));
3034
5
          double se = 1.0 / sqrt(n - 3);
3035          double alpha = 1.0 - conf_level;
3036          double q = inverse_normal_cdf(1.0 - alpha/2.0);
3037
5
          ci_lower = tanh(z - q * se);
3038
18
          ci_upper = tanh(z + q * se);
3039
3040
15
          // HIGH-PRECISION P-VALUE USING INCOMPLETE BETA
3041
15
          p_value = get_t_pvalue(statistic, df, alternative);
3042
15
        } else if (is_kendall) {
3043
15
          int c = 0, d = 0, tie_x = 0, tie_y = 0;
3044
15
          for (size_t i = 0; i < n - 1; i++) {
3045
15
                   for (size_t j = i + 1; j < n; j++) {
3046                       double sign_x = (x[i] - x[j] > 0) - (x[i] - x[j] < 0);
3047
3
                       double sign_y = (y[i] - y[j] > 0) - (y[i] - y[j] < 0);
3048
3049                       if (sign_x == 0 && sign_y == 0) { /* Joint tie, ignore */ }
3050
4
                       else if (sign_x == 0) tie_x++;
3051
19
                       else if (sign_y == 0) tie_y++;
3052
16
                       else if (sign_x * sign_y > 0) c++;
3053
16
                       else d++;
3054                   }
3055          }
3056          double denom = sqrt((double)(c + d + tie_x) * (double)(c + d + tie_y));
3057
4
          estimate = (denom == 0.0) ? (0.0/0.0) : (double)(c - d) / denom;
3058
3059
21
          bool has_ties = (tie_x > 0 || tie_y > 0);
3060
5
          bool do_exact;
3061
3062          /* Mirror R: exact defaults to TRUE if N < 50 and NO ties */
3063          if (!exact_sv || !SvOK(exact_sv)) {
3064
8
                   do_exact = (n < 50) && !has_ties;
3065
8
          } else {
3066                   do_exact = SvTRUE(exact_sv) ? 1 : 0;
3067
5
          }
3068          // If forced exact but ties exist, R overrides and falls back to approximation anyway
3069          if (do_exact && has_ties) do_exact = 0;
3070
3071
8
          if (do_exact) {
3072
4
                   double S_stat = c - d;
3073                   statistic = c;
3074
1
                   p_value = kendall_exact_pvalue(n, S_stat, alternative);
3075
6
          } else {
3076
5
                   // Normal approximation for large N or ties
3077
5
                   double var_S = n * (n - 1) * (2.0 * n + 5.0) / 18.0;
3078
1
                   double S = c - d;
3079                   if (continuity) S -= (S > 0 ? 1 : -1);
3080
9
                   statistic = S / sqrt(var_S);
3081
3082
5
                   if (strcmp(alternative, "two.sided") == 0) {
3083
0
                       p_value = 2.0 * (1.0 - approx_pnorm(fabs(statistic)));
3084                   } else if (strcmp(alternative, "less") == 0) {
3085
21
                       p_value = approx_pnorm(statistic);
3086
22
                   } else {
3087
22
                       p_value = 1.0 - approx_pnorm(statistic);
3088
21
                   }
3089
22
          }
3090
22
        } else if (is_spearman) {
3091
22
          double *restrict rank_x = safemalloc(n * sizeof(double));
3092
21
          double *restrict rank_y = safemalloc(n * sizeof(double));
3093
12
          compute_ranks(x, rank_x, n);
3094
12
          compute_ranks(y, rank_y, n);
3095
3096
12
          // Spearman rho = Pearson r of the ranks (Welford's Method)
3097
13
          double mean_x = 0.0, mean_y = 0.0, M2_x = 0.0, M2_y = 0.0, cov = 0.0;
3098          for (size_t i = 0; i < n; i++) {
3099                   double dx = rank_x[i] - mean_x;
3100
21
                   mean_x += dx / (i + 1);
3101                   double dy = rank_y[i] - mean_y;
3102                   mean_y += dy / (i + 1);
3103                   M2_x += dx * (rank_x[i] - mean_x);
3104                   M2_y += dy * (rank_y[i] - mean_y);
3105                   cov  += dx * (rank_y[i] - mean_y);
3106          }
3107          estimate = (M2_x > 0.0 && M2_y > 0.0) ? cov / sqrt(M2_x * M2_y) : 0.0;
3108
3109          // S = sum of squared rank differences (R's reported statistic)
3110          double S_stat = 0.0;
3111
6
          for (size_t i = 0; i < n; i++) {
3112
13
                   double diff = rank_x[i] - rank_y[i];
3113                   S_stat += diff * diff;
3114
13
          }
3115
3116          // Ties produce fractional (averaged) ranks — detect them
3117          bool has_ties = 0, do_exact;
3118
13
          for (size_t i = 0; i < n; i++) {
3119
13
                   if (rank_x[i] != floor(rank_x[i]) || rank_y[i] != floor(rank_y[i])) {
3120                       has_ties = 1;
3121
13
                       break;
3122                   }
3123          }
3124
85
          if (!exact_sv || !SvOK(exact_sv)) {
3125
79
                   do_exact = (n < 10) && !has_ties;
3126
76
          } else {
3127
76
                   do_exact = SvTRUE(exact_sv) ? 1 : 0;
3128
76
          }
3129
3130
76
          if (do_exact) {
3131
79
                   statistic = S_stat;
3132                   p_value   = spearman_exact_pvalue(S_stat, n, alternative);
3133          } else {
3134                   double r = estimate;
3135                   if (continuity)
3136
8
                       r *= (1.0 - 1.0 / (2.0 * (n - 1)));
3137
2
                   statistic = r * sqrt((n - 2.0) / (1.0 - r * r));
3138
2
                   p_value = get_t_pvalue(statistic, (double)(n - 2), alternative);
3139          }
3140          Safefree(rank_x); Safefree(rank_y);
3141
6
        } else {
3142          Safefree(x); Safefree(y);
3143
80
          croak("Unknown method");
3144
74
        }
3145        Safefree(x); Safefree(y);
3146
8
        rhv = newHV();
3147
26
        hv_stores(rhv, "estimate", newSVnv(estimate));
3148
24
        hv_stores(rhv, "p.value", newSVnv(p_value));
3149        hv_stores(rhv, "statistic", newSVnv(statistic));
3150
30
        hv_stores(rhv, "method", newSVpv(method, 0));
3151        hv_stores(rhv, "alternative", newSVpv(alternative, 0));
3152        if (is_pearson) {
3153
30
          hv_stores(rhv, "parameter", newSVnv(df));
3154
24
          AV *restrict ci_av = newAV();
3155
24
          av_push(ci_av, newSVnv(ci_lower));
3156
24
          av_push(ci_av, newSVnv(ci_upper));
3157
24
          hv_stores(rhv, "conf.int", newRV_noinc((SV*)ci_av));
3158        }
3159
3160        RETVAL = newRV_noinc((SV*)rhv);
3161}
3162
6
OUTPUT:
3163
6
    RETVAL
3164
3165
104
void shapiro_test(data)
3166
96
        SV *data
3167
74
PREINIT:
3168        AV *restrict av;
3169
6
        HV *restrict ret_hash;
3170
6
        size_t n_raw, n = 0;
3171
8
        double *restrict x, w = 0.0, p_val = 0.0, mean = 0.0, ssq = 0.0;
3172
8
PPCODE:
3173
9
        if (!SvROK(data) || SvTYPE(SvRV(data)) != SVt_PVAV) {
3174
3
          croak("Expected an array reference");
3175
12
        }
3176
3177        av = (AV *)SvRV(data);
3178        n_raw = av_len(av) + 1;
3179
3180
5
        Newx(x, n_raw, double);
3181
3182
5
        // Extract variables and calculate mean (skipping undefined/NaN values)
3183
74
        for (size_t i = 0; i < n_raw; i++) {
3184
69
          SV **restrict elem = av_fetch(av, i, 0);
3185          if (elem && SvOK(*elem)) {
3186                   double val = SvNV(*elem);
3187
102
                   if (!isnan(val)) {
3188
74
                       x[n] = val;
3189                       mean += val;
3190
8
                       n++;
3191                   }
3192          }
3193        }
3194
3195
8
        if (n < 3 || n > 5000) {
3196          Safefree(x);
3197
8
          croak("Sample size must be between 3 and 5000 (R's limit)");
3198        }
3199
3200        mean /= n;
3201
6
        // Calculate Sum of Squares */
3202
4
        for (size_t i = 0; i < n; i++) {
3203
7
          ssq += (x[i] - mean) * (x[i] - mean);
3204
3
        }
3205        if (ssq == 0.0) {
3206          Safefree(x);
3207
4
          croak("Data is perfectly constant; cannot compute Shapiro-Wilk test");
3208
4
        }
3209
4
        qsort(x, n, sizeof(double), compare_doubles);
3210
3211        // --- Core AS R94 Algorithm: Weights and Statistic W ---
3212        if (n == 3) {
3213
19
          double a_val = 0.7071067811865475; /* sqrt(1/2) */
3214          double b_val = a_val * (x[2] - x[0]);
3215          w = (b_val * b_val) / ssq;
3216          if (w < 0.75) w = 0.75;
3217
18
          // Exact P-value for n=3
3218          p_val = 1.90985931710274 * (asin(sqrt(w)) - 1.04719755119660);
3219
29
        } else {
3220
27
          double *restrict m, *restrict a;
3221
5
          double sum_m2 = 0.0, b_val = 0.0;
3222
5
          Newx(m, n, double);
3223
5
          Newx(a, n, double);
3224          for (size_t i = 0; i < n; i++) {
3225                   m[i] = inverse_normal_cdf((i + 1.0 - 0.375) / (n + 0.25));
3226
7
                   sum_m2 += m[i] * m[i];
3227
7
          }
3228          double u = 1.0 / sqrt((double)n);
3229
7
          double a_n = -2.706056*pow(u,5) + 4.434685*pow(u,4) - 2.071190*pow(u,3) - 0.147981*pow(u,2) + 0.221157*u + m[n-1]/sqrt(sum_m2);
3230          a[n-1] = a_n;
3231
6
          a[0]   = -a_n;
3232
7
          if (n == 4 || n == 5) {
3233
7
                   double eps = (sum_m2 - 2.0 * m[n-1]*m[n-1]) / (1.0 - 2.0 * a_n*a_n);
3234
7
                   for (unsigned int i = 1; i < n-1; i++) {
3235
7
                       a[i] = m[i] / sqrt(eps);
3236
7
                   }
3237
7
          } else {
3238
7
                   double a_n1 = -3.582633*pow(u,5) + 5.682633*pow(u,4) - 1.752461*pow(u,3) - 0.293762*pow(u,2) + 0.042981*u + m[n-2]/sqrt(sum_m2);
3239                   a[n-2] = a_n1;
3240                   a[1]   = -a_n1;
3241                   double eps = (sum_m2 - 2.0 * m[n-1]*m[n-1] - 2.0 * m[n-2]*m[n-2]) / (1.0 - 2.0 * a_n*a_n - 2.0 * a_n1*a_n1);
3242                   for (unsigned int i = 2; i < n-2; i++) {
3243
31
                       a[i] = m[i] / sqrt(eps);
3244
31
                   }
3245
31
          }
3246          for (size_t i = 0; i < n; i++) {
3247
30103
                   b_val += a[i] * x[i];
3248
30074
          }
3249
30083
          w = (b_val * b_val) / ssq;
3250
11
        // --- AS R94 P-Value Calculation: High Precision Refinement ---
3251
11
          /* NOTE: p_val is declared in PREINIT above;
3252
644
                * do NOT shadow it with a local 'double p_val' here or the result will never reach the caller.
3253
635
                */
3254
635
          double y = log(1.0 - w);
3255
635
          double z;
3256
635
          if (n <= 11) {
3257
11
                   // Royston's branch for 4 <= n <= 11 (AS R94, small-sample path).
3258
11
                   // gamma is the upper bound on y = log(1-W);
3259                   // if y reaches gamma the p-value is essentially zero
3260
645
                   double nn = (double)n;
3261                   double gamma = 0.459 * nn - 2.273;
3262                   if (y >= gamma) {
3263
30075
                       p_val = 1e-19;
3264
30075
                   } else {
3265
40101
                       // Horner-form polynomials in n for mu and log(sigma)
3266
10076
                       double mu     = 0.544  + nn * (-0.39978  + nn * ( 0.025054  - nn * 0.0006714));
3267
10079
                       double sig_val= 1.3822 + nn * (-0.77857  + nn * ( 0.062767  - nn * 0.0020322));
3268                       double sigma  = exp(sig_val);
3269
30067
                       z = (-log(gamma - y) - mu) / sigma;
3270                       /* Upper-tail probability P(Z > z): small W → large z → small p-value.
3271                       */
3272
34
                       p_val = 0.5 * erfc(z * M_SQRT1_2);
3273
243
                   }
3274          } else {
3275                   // Royston's branch for n >= 12 (AS R94, large-sample path)
3276                   double ln_n   = log((double)n);
3277                   // Horner-form polynomials in log(n) for mu and log(sigma). */
3278                   double mu     = -1.5861 + ln_n * (-0.31082 + ln_n * (-0.083751 + ln_n * 0.0038915));
3279                   double sig_val= -0.4803 + ln_n * (-0.082676 + ln_n * 0.0030302);
3280
246
                   double sigma  = exp(sig_val);
3281
246
                   z = (y - mu) / sigma;
3282
245
                   p_val = 0.5 * erfc(z * M_SQRT1_2);
3283          }
3284
30317
          // Clamp the p-value
3285
30075
          if (p_val > 1.0) p_val = 1.0;
3286
30087
          if (p_val < 0.0) p_val = 0.0;
3287
3288
13
          Safefree(m); m = NULL;  Safefree(a); a = NULL;
3289
10969
        }
3290
10956
        Safefree(x); x = NULL;
3291
10956
        ret_hash = newHV();
3292
957
        hv_stores(ret_hash, "statistic", newSVnv(w));
3293
957
        hv_stores(ret_hash, "W",         newSVnv(w));
3294
10065
        hv_stores(ret_hash, "p_value",   newSVnv(p_val));
3295
43
        hv_stores(ret_hash, "p.value",   newSVnv(p_val));
3296        EXTEND(SP, 1);
3297
943
        PUSHs(sv_2mortal(newRV_noinc((SV *)ret_hash)));
3298
3299double min(...)
3300
30069
        PROTOTYPE: @
3301
30073
        INIT:
3302
30073
                double min_val = 0.0;
3303
102
                size_t count = 0;
3304
10128
                bool first = TRUE;
3305        CODE:
3306
40088
                for (unsigned short int i = 0; i < items; i++) {
3307                        SV* restrict arg = ST(i);
3308                        if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
3309
10065
                                AV* restrict av = (AV*)SvRV(arg);
3310
35
                                size_t len = av_len(av) + 1;
3311                                for (size_t j = 0; j < len; j++) {
3312                                     SV** restrict tv = av_fetch(av, j, 0);
3313                                     if (tv && SvOK(*tv)) {
3314                                         double val = SvNV(*tv);
3315                                         if (first || val < min_val) {
3316                                             min_val = val;
3317
17
                                             first = FALSE;
3318
329
                                         }
3319                                         count++;
3320                                     } else {
3321
325
                                         croak("min: undefined value at array ref index %zu (argument %d)", j, (int)i);
3322                                     }
3323
325
                                 }
3324                        } else if (SvOK(arg)) {
3325
324
                                 double val = SvNV(arg);
3326
312
                                 if (first || val < min_val) {
3327                                     min_val = val;
3328                                     first = FALSE;
3329
57
                                 }
3330                                 count++;
3331
45
                        } else {
3332
330
                                 croak("min: undefined value at argument index %d", (int)i);
3333
19
                        }
3334
10029
                }
3335
10028
                if (count == 0) croak("min needs >= 1 numeric element");
3336
10028
                RETVAL = min_val;
3337
33
        OUTPUT:
3338
39
          RETVAL
3339
3340
7
double max(...)
3341
17
        PROTOTYPE: @
3342
16
        INIT:
3343
10
                double max_val = 0.0;
3344
10
                size_t count = 0;
3345
10
                bool first = TRUE;
3346
10
        CODE:
3347
10
                for (size_t i = 0; i < items; i++) {
3348                   SV* restrict arg = ST(i);
3349                   if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
3350                       AV* restrict av = (AV*)SvRV(arg);
3351                       size_t len = av_len(av) + 1;
3352
12
                       for (size_t j = 0; j < len; j++) {
3353
20
                           SV** restrict tv = av_fetch(av, j, 0);
3354
16
                           if (tv && SvOK(*tv)) {
3355
12
                               double val = SvNV(*tv);
3356
9
                               if (first || val > max_val) {
3357
5
                                   max_val = val;
3358
5
                                   first = FALSE;
3359
5
                               }
3360
5
                               count++;
3361                           } else {
3362
4
                               croak("max: undefined value at array ref index %zu (argument %zu)", j, i);
3363                           }
3364
14
                       }
3365                   } else if (SvOK(arg)) {
3366
14
                       double val = SvNV(arg);
3367
2
                       if (first || val > max_val) {
3368                           max_val = val;
3369                           first = FALSE;
3370
14
                       }
3371
14
                       count++;
3372
14
                   } else {
3373
14
                       croak("max: undefined value at argument index %zu", i);
3374                   }
3375
14
          }
3376
60062
          if (count == 0) croak("max needs >= 1 numeric element");
3377          RETVAL = max_val;
3378
60052
        OUTPUT:
3379
2
                RETVAL
3380
3381
60050
SV* runif(...)
3382CODE:
3383
60050
{
3384        size_t n = 0;
3385
13
        double min = 0.0, max = 1.0;
3386
3387        // Flags to track what has been assigned
3388        bool n_set = 0, min_set = 0, max_set = 0;
3389
3390        unsigned int i = 0;
3391
3392        if (items == 0) {
3393          croak("Usage: runif(n, [min=0], [max=1]) or runif(n => $n, ...)");
3394
37
        }
3395
3396
4
        while (i < items) {
3397                // 1. Check if the current argument is a string key for a named parameter
3398
34
                if (i + 1 < items && SvPOK(ST(i))) {
3399
33
                        char *restrict key = SvPV_nolen(ST(i));
3400                        if (strEQ(key, "n")) {
3401
37
                                n = (size_t)SvUV(ST(i+1));
3402                                n_set = 1;
3403
130
                                i += 2;
3404
93
                                continue;
3405
97
                        } else if (strEQ(key, "min")) {
3406                                min = SvNV(ST(i+1));
3407
97
                                min_set = 1;
3408
64
                                i += 2;
3409
34
                                continue;
3410
4
                        } else if (strEQ(key, "max")) {
3411                                max = SvNV(ST(i+1));
3412                                max_set = 1;
3413                                i += 2;
3414
20053
                                continue;
3415
20043
                        }
3416                }
3417
3418
20037
                // 2. Fallback to positional parsing if it's not a recognized key
3419
20037
                if (!n_set) {
3420
61522
                        n = (size_t)SvUV(ST(i));
3421
61509
                        n_set = 1;
3422                } else if (!min_set) {
3423                        min = SvNV(ST(i));
3424                        min_set = 1;
3425
33
                } else if (!max_set) {
3426                        max = SvNV(ST(i));
3427                        max_set = 1;
3428                } else {
3429                        croak("Too many arguments or unrecognized parameter passed to runif()");
3430                }
3431                i++;
3432        }
3433        if (!n_set) {
3434                croak("runif() requires at least the 'n' parameter");
3435
28
        }
3436
17
        // Ensure PRNG is seeded
3437        AUTO_SEED_PRNG();
3438
32
        AV *restrict results = newAV();
3439
32
        if (n > 0) {
3440
63
                av_extend(results, n - 1);
3441        }
3442        const double range = max - min;
3443        for (size_t j = 0; j < n; j++) {
3444
49
                double r;
3445
49
                if (max < min) {
3446
49
                        r = NAN; // R behavior for inverted ranges
3447                } else {
3448
6098
                        r = min + range * Drand01();
3449
6073
                }
3450
6063
                av_push(results, newSVnv(r));
3451
6074
        }
3452
6069
        RETVAL = newRV_noinc((SV*)results);
3453
6067
}
3454
6067
OUTPUT:
3455    RETVAL
3456
3457
22
SV* rbinom(...)
3458
20506
        CODE:
3459
20499
        {
3460          // Auto-seed the PRNG if the Perl script hasn't done so yet
3461          AUTO_SEED_PRNG();
3462
22
          if (items % 2 != 0)
3463                   croak("Usage: rbinom(n => 10, size => 100, prob => 0.5)");
3464
24
          //Parse named arguments
3465          size_t n = 0, size = 0;
3466
2
          double prob = 0.5;
3467
3468          bool size_set = FALSE, prob_set = FALSE;
3469
3470          for (unsigned short i = 0; i < items; i += 2) {
3471
22
                   const char* restrict key = SvPV_nolen(ST(i));
3472
21
                   SV* restrict val = ST(i + 1);
3473
3474                   if      (strEQ(key, "n"))      n    = (unsigned int)SvUV(val);
3475                   else if (strEQ(key, "size")) { size = (unsigned int)SvUV(val); size_set = TRUE; }
3476                   else if (strEQ(key, "prob")) { prob = SvNV(val); prob_set = TRUE; }
3477
21
                   else croak("rbinom: unknown argument '%s'", key);
3478
2026
          }
3479
3480          // R requires size and prob to be explicitly passed in rbinom
3481
2036
          if (!size_set || !prob_set) croak("rbinom: 'size' and 'prob' are required arguments");
3482          if (prob < 0.0 || prob > 1.0) croak("rbinom: prob must be between 0 and 1");
3483
3484          AV *restrict result_av = newAV();
3485
2036
          if (n > 0) {
3486
2036
                   av_extend(result_av, n - 1);
3487
2035
                   for (unsigned int i = 0; i < n; i++) {
3488
2035
                       av_store(result_av, i, newSVuv(generate_binomial(size, prob)));
3489                   }
3490          }
3491
3492
89
          RETVAL = newRV_noinc((SV*)result_av);
3493
69
        }
3494        OUTPUT:
3495                RETVAL
3496
3497
15
SV*
3498hist(SV* x_sv, ...)
3499        CODE:
3500
20
        {
3501
20
                // 1. Validate Input
3502
15
                if (!SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV)
3503
20
                        croak("hist: first argument must be an array reference");
3504
3505
89
                AV*restrict x_av = (AV*)SvRV(x_sv);
3506
74
                size_t n_raw = av_len(x_av) + 1;
3507
74
                if (n_raw == 0) croak("hist: input array is empty");
3508
3509
54
                // 2. Extract Data & Find Range
3510
59
                double *restrict x;
3511                Newx(x, n_raw, double);
3512                size_t n = 0;
3513
20
                double min_val = DBL_MAX, max_val = -DBL_MAX;
3514
3515
20
                for (size_t i = 0; i < n_raw; i++) {
3516
20
                        SV**restrict tv = av_fetch(x_av, i, 0);
3517                        if (tv && SvOK(*tv)) {
3518                                 double val = SvNV(*tv);
3519
20
                                 x[n++] = val;
3520
43
                                 if (val < min_val) min_val = val;
3521                                 if (val > max_val) max_val = val;
3522
38
                        }
3523                }
3524                if (n == 0) {
3525                        Safefree(x);
3526                        croak("hist: input contains no valid numeric data");
3527                }
3528                // 3. Determine Bin Count (Sturges default or user-provided)
3529                size_t n_bins = 0;
3530
3531
17
                if (items == 2) {
3532
17
                        // Support pure positional argument: hist($data, 22)
3533                        n_bins = (size_t)SvIV(ST(1));
3534                } else if (items > 2) {
3535
17
                        /* Support named parameters even if mixed with positional arguments */
3536
14
                        for (unsigned short i = 1; i < items - 1; i++) {
3537
14
                                 /* Make sure the SV holds a string before doing string comparison */
3538                                 if (SvPOK(ST(i)) && strEQ(SvPV_nolen(ST(i)), "breaks")) {
3539                                     n_bins = (size_t)SvIV(ST(i+1));
3540                                     break;
3541
40
                                 }
3542
23
                        }
3543                        /* Fallback: if 'breaks' wasn't found but a positional number was given first */
3544
50
                        if (n_bins == 0 && looks_like_number(ST(1))) {
3545
33
                                 n_bins = (size_t)SvIV(ST(1));
3546
33
                        }
3547                }
3548
33
                if (n_bins == 0) n_bins = calculate_sturges_bins(n);
3549
17
                // 4. Allocate Result Arrays
3550
5
                double *restrict breaks, *restrict mids, *restrict density;
3551                size_t *restrict counts;
3552
17
                Newx(breaks,  n_bins + 1, double);
3553
5
                Newx(mids,    n_bins,     double);
3554
17
                Newx(density, n_bins,     double);
3555
17
                Newx(counts,  n_bins,     size_t);
3556
3557                // Generate simple linear breaks
3558                double step = (max_val - min_val) / (double)n_bins;
3559                for (size_t i = 0; i <= n_bins; i++) {
3560
16
                        breaks[i] = min_val + (double)i * step;
3561
16
                }
3562
3563
616
                // 5. Compute Statistics
3564
615
                compute_hist_logic(x, n, breaks, n_bins, counts, mids, density);
3565
3566                // 6. Build Return HashRef
3567                HV*restrict res_hv = newHV();
3568
16
                AV*restrict av_breaks  = newAV();
3569
0
                AV*restrict av_counts  = newAV();
3570
9
                AV*restrict av_mids    = newAV();
3571                AV*restrict av_density = newAV();
3572                for (size_t i = 0; i <= n_bins; i++) {
3573
17
                        av_push(av_breaks, newSVnv(breaks[i]));
3574                        if (i < n_bins) {
3575
17
                                 av_push(av_counts,  newSViv(counts[i]));
3576
17
                                 av_push(av_mids,    newSVnv(mids[i]));
3577                                 av_push(av_density, newSVnv(density[i]));
3578                        }
3579
28
                }
3580
12
                hv_stores(res_hv, "breaks",  newRV_noinc((SV*)av_breaks));
3581
16
                hv_stores(res_hv, "counts",  newRV_noinc((SV*)av_counts));
3582
12
                hv_stores(res_hv, "mids",    newRV_noinc((SV*)av_mids));
3583
43
                hv_stores(res_hv, "density", newRV_noinc((SV*)av_density));
3584
3585
31
                // Clean
3586
31
                Safefree(x); Safefree(breaks); Safefree(mids);
3587
4
                Safefree(density); Safefree(counts);
3588
3589                RETVAL = newRV_noinc((SV*)res_hv);
3590        }
3591        OUTPUT:
3592
204
          RETVAL
3593
3594SV* quantile(...)
3595        CODE:
3596        {
3597
216
                SV *restrict x_sv = NULL;
3598                SV *restrict probs_sv = NULL;
3599
43
                int arg_idx = 0;
3600
3601                /* --- 1. Consume first positional arg as 'x' if it's an array ref --- */
3602
27
                if (arg_idx < items && SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) {
3603
7
                         x_sv = ST(arg_idx);
3604
28
                         arg_idx++;
3605
7
                }
3606
3607
7
                /* --- 2. Remaining args must be key-value pairs --- */
3608                if ((items - arg_idx) % 2 != 0)
3609                         croak("Usage: quantile(\\@data, probs => \\@probs)  OR  quantile(x => \\@data, probs => \\@probs)");
3610
3611
22
                for (; arg_idx < items; arg_idx += 2) {
3612
31
                         const char *restrict key = SvPV_nolen(ST(arg_idx));
3613
27
                         SV *restrict val = ST(arg_idx + 1);
3614
3615                         if      (strEQ(key, "x"))     x_sv     = val;
3616                         else if (strEQ(key, "probs")) probs_sv = val;
3617                         else croak("quantile: unknown argument '%s'", key);
3618
36
                }
3619                if (!x_sv || !SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV)
3620
36
                        croak("quantile: 'x' must be an array reference");
3621
27
                AV *restrict x_av = (AV*)SvRV(x_sv);
3622                size_t n_raw = av_len(x_av) + 1;
3623
0
                if (n_raw == 0) croak("quantile: 'x' is empty");
3624
3625                /* --- Extract valid numeric data & drop NAs --- */
3626
27
                double *restrict x;
3627                Newx(x, n_raw, double);
3628                size_t n = 0;
3629
12
                for (size_t i = 0; i < n_raw; i++) {
3630
16
                        SV **restrict tv = av_fetch(x_av, i, 0);
3631                        if (tv && SvOK(*tv)) {
3632
25
                                 x[n++] = SvNV(*tv);
3633                        }
3634                }
3635                if (n == 0) {
3636                        Safefree(x);
3637                        croak("quantile: 'x' contains no valid numbers");
3638                }
3639                // --- Sort Data for Quantile Math ---
3640                qsort(x, n, sizeof(double), compare_doubles);
3641
126
                // --- Parse Probabilities (Default matches R's c(0, .25, .5, .75, 1)) ---
3642
126
                double default_probs[] = {0.0, 0.25, 0.50, 0.75, 1.0};
3643                unsigned int n_probs = 5;
3644
268
                double *restrict probs;
3645
3646
262
                if (probs_sv && SvROK(probs_sv) && SvTYPE(SvRV(probs_sv)) == SVt_PVAV) {
3647
118
                        AV *restrict p_av = (AV*)SvRV(probs_sv);
3648
112
                        n_probs = av_len(p_av) + 1;
3649
60921
                        Newx(probs, n_probs, double);
3650
60810
                        for (unsigned int i = 0; i < n_probs; i++) {
3651
60810
                                 SV **tv = av_fetch(p_av, i, 0);
3652                                 probs[i] = (tv && SvOK(*tv)) ? SvNV(*tv) : 0.0;
3653
45
                                 if (probs[i] < 0.0 || probs[i] > 1.0) {
3654
48
                                     Safefree(x); Safefree(probs);
3655                                     croak("quantile: probabilities must be between 0 and 1");
3656                                 }
3657
126
                        }
3658
123
                } else {
3659                        Newx(probs, n_probs, double);
3660                        for (unsigned int i = 0; i < n_probs; i++) probs[i] = default_probs[i];
3661                }
3662
3663                /* --- Calculate Quantiles (R Type 7 Algorithm) --- */
3664                HV *restrict res_hv = newHV();
3665
3666
24
                for (size_t i = 0; i < n_probs; i++) {
3667                        double p = probs[i], q = 0.0;
3668
3669
70
                        if (n == 1) {
3670
55
                                 q = x[0];
3671
58
                        } else if (p == 1.0) {
3672
44
                                 q = x[n - 1]; /* Prevent out-of-bounds mapping */
3673
44
                        } else if (p == 0.0) {
3674
30093
                                 q = x[0];
3675
30051
                        } else {
3676
30088
                                 /* Continuous sample quantile interpolation (Type 7) */
3677
30035
                                 double h = (n - 1) * p;
3678
30035
                                 unsigned int j = (unsigned int)h; /* floor via cast */
3679
50304
                                 double gamma = h - j;
3680
50267
                                 q = (1.0 - gamma) * x[j] + gamma * x[j + 1];
3681
50267
                        }
3682
3683                        /* Format hash key to exactly match R's naming convention ("25%", "33.3%") */
3684
20317
                        char key[32];
3685
20317
                        double pct = p * 100.0;
3686
3687
64
                        if (pct == (unsigned int)pct) {
3688
63
                                 snprintf(key, sizeof(key), "%.0f%%", pct);
3689
63
                        } else {
3690                                 snprintf(key, sizeof(key), "%.1f%%", pct);
3691                        }
3692
3693
51
                        hv_store(res_hv, key, strlen(key), newSVnv(q), 0);
3694                }
3695
3696                Safefree(x);
3697                Safefree(probs);
3698
3699                RETVAL = newRV_noinc((SV*)res_hv);
3700
56
        }
3701
23
        OUTPUT:
3702          RETVAL
3703
3704
3705
52
double mean(...)
3706
58
        PROTOTYPE: @
3707
26
        INIT:
3708
11
          double total = 0;
3709
30041
          size_t count = 0;
3710
30041
        CODE:
3711
30040
                for (size_t i = 0; i < items; i++) {
3712
30040
                        SV* restrict arg = ST(i);
3713
30039
                        if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
3714
30039
                                AV* restrict av = (AV*)SvRV(arg);
3715
30031
                                size_t len = av_len(av) + 1;
3716
30044
                                for (size_t j = 0; j < len; j++) {
3717                                     SV** restrict tv = av_fetch(av, j, 0);
3718                                     if (tv && SvOK(*tv)) {
3719
37
                                         total += SvNV(*tv);
3720
37
                                         count++;
3721
25
                                     } else {
3722
27
                                         croak("mean: undefined value at array ref index %zu (argument %zu)", j, i);
3723
27
                                     }
3724
31
                                }
3725                        } else if (SvOK(arg)) {
3726                                 total += SvNV(arg);
3727
25
                                 count++;
3728
41
                        } else {
3729                                 croak("mean: undefined value at argument index %zu", i);
3730                        }
3731                }
3732                if (count == 0) croak("mean needs >= 1 element");
3733                RETVAL = total / count;
3734        OUTPUT:
3735
162
          RETVAL
3736
3737
143
double sum(...)
3738
143
        PROTOTYPE: @
3739
10143
        INIT:
3740          double total = 0;
3741
10142
          size_t count = 0;
3742        CODE:
3743          for (size_t i = 0; i < items; i++) {
3744
10142
                   SV* restrict arg = ST(i);
3745
10063
                   if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
3746
10063
                       AV* restrict av = (AV*)SvRV(arg);
3747                       size_t len = av_len(av) + 1;
3748                       for (size_t j = 0; j < len; j++) {
3749                           SV** restrict tv = av_fetch(av, j, 0);
3750
10141
                           if (tv && SvOK(*tv)) {
3751
10012
                               total += SvNV(*tv);
3752
10012
                               count++;
3753                           } else {
3754                               croak("sum: undefined value at array ref index %zu (argument %zu)", j, i);
3755                           }
3756
142
                       }
3757
19
                   } else if (SvOK(arg)) {
3758                       total += SvNV(arg);
3759                       count++;
3760                   } else {
3761
387
                       croak("sum: undefined value at argument index %zu", i);
3762
246
                   }
3763
246
          }
3764          if (count == 0) croak("sum needs >= 1 element");
3765
246
          RETVAL = total;
3766
171
        OUTPUT:
3767
139
          RETVAL
3768
3769
25
double sd(...)
3770
20
    PROTOTYPE: @
3771
14
    INIT:
3772
21
        double mean = 0.0, M2 = 0.0;
3773        size_t count = 0;
3774    CODE:
3775        /* Single Pass Standard Deviation via Welford's Algorithm */
3776
156
        for (size_t i = 0; i < items; i++) {
3777
21
            SV* restrict arg = ST(i);
3778
142
            if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
3779
142
                AV* restrict av = (AV*)SvRV(arg);
3780
10153
                size_t len = av_len(av) + 1;
3781
10150
                for (size_t j = 0; j < len; j++) {
3782
10150
                    SV** restrict tv = av_fetch(av, j, 0);
3783
10035
                    if (tv && SvOK(*tv)) {
3784                        count++;
3785
10149
                        double val = SvNV(*tv);
3786
10014
                        double delta = val - mean;
3787                        mean += delta / count;
3788
10146
                        M2 += delta * (val - mean);
3789
10146
                    } else {
3790
1234
                        croak("sd: undefined value at array ref index %zu (argument %zu)", j, i);
3791
1109
                    }
3792
1108
                }
3793
1108
            } else if (SvOK(arg)) {
3794
1108
                count++;
3795
1108
                double val = SvNV(arg);
3796                double delta = val - mean;
3797
145
                mean += delta / count;
3798
136
                M2 += delta * (val - mean);
3799            } else {
3800
159
                croak("sd: undefined value at argument index %zu", i);
3801
32
            }
3802
71
        }
3803
71
        if (count < 2) croak("sd needs >= 2 elements");
3804
68
        RETVAL = sqrt(M2 / (count - 1));
3805
359
    OUTPUT:
3806
338
        RETVAL
3807
3808
3809
312
double var(...)
3810
312
        PROTOTYPE: @
3811        INIT:
3812
68
          double mean = 0.0, M2 = 0.0;
3813
25
          size_t count = 0;
3814
10
        CODE:
3815
89
          /* Single Pass Variance via Welford's Algorithm */
3816
36
          for (size_t i = 0; i < items; i++) {
3817
159
                   SV* restrict arg = ST(i);
3818
112
                   if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
3819
112
                       AV* restrict av = (AV*)SvRV(arg);
3820
112
                       size_t len = av_len(av) + 1;
3821
87
                       for (size_t j = 0; j < len; j++) {
3822
82
                           SV** restrict tv = av_fetch(av, j, 0);
3823
47
                           if (tv && SvOK(*tv)) {
3824                               count++;
3825
13
                               double val = SvNV(*tv);
3826
10
                               double delta = val - mean;
3827
8
                               mean += delta / count;
3828
6
                               M2 += delta * (val - mean);
3829
53
                           } else {
3830
7
                               croak("var: undefined value at array ref index %zu (argument %zu)", j, i);
3831
52
                           }
3832
61
                       }
3833
52
                   } else if (SvOK(arg)) {
3834
52
                       count++;
3835
52
                       double val = SvNV(arg);
3836
14
                       double delta = val - mean;
3837
52
                       mean += delta / count;
3838
7
                       M2 += delta * (val - mean);
3839
51
                   } else {
3840
51
                       croak("var: undefined value at argument index %zu", i);
3841                   }
3842
420
          }
3843
375
          if (count < 2) croak("var needs >= 2 elements");
3844
375
          RETVAL = M2 / (count - 1);
3845
375
        OUTPUT:
3846
375
          RETVAL
3847
3848
54
SV* t_test(...)
3849
54
        CODE:
3850
60
        {
3851
18
                SV*restrict x_sv = NULL;
3852                SV*restrict y_sv = NULL;
3853                double mu = 0.0, conf_level = 0.95;
3854
113
                bool paired = FALSE, var_equal = FALSE;
3855
113
                const char*restrict alternative = "two.sided";
3856
3857
209
                int arg_idx = 0;
3858
3859                // 1. Shift first positional argument as 'x' if it's an array reference
3860
223
                if (arg_idx < items && SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) {
3861
223
                  x_sv = ST(arg_idx);
3862
223
                  arg_idx++;
3863
100
                }
3864
3865
10
                // 2. Shift second positional argument as 'y' if it's an array reference
3866
125
                if (arg_idx < items && SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) {
3867
17
                  y_sv = ST(arg_idx);
3868
15
                  arg_idx++;
3869
15
                }
3870
3871
132
                // Ensure the remaining arguments form complete key-value pairs
3872
132
                if ((items - arg_idx) % 2 != 0) {
3873
132
                  croak("Usage: t_test(\\@x, [\\@y], key => value, ...)");
3874                }
3875
3876
138
                // --- Parse named arguments from the remaining flat stack ---
3877
138
                for (; arg_idx < items; arg_idx += 2) {
3878
128
                        const char*restrict key = SvPV_nolen(ST(arg_idx));
3879
128
                        SV*restrict val = ST(arg_idx + 1);
3880
3881
128
                        if      (strEQ(key, "x"))           x_sv        = val;
3882
128
                        else if (strEQ(key, "y"))           y_sv        = val;
3883                        else if (strEQ(key, "mu"))          mu          = SvNV(val);
3884                        else if (strEQ(key, "paired"))      paired      = SvTRUE(val);
3885                        else if (strEQ(key, "var_equal"))   var_equal   = SvTRUE(val);
3886                        else if (strEQ(key, "conf_level"))  conf_level  = SvNV(val);
3887                        else if (strEQ(key, "alternative")) alternative = SvPV_nolen(val);
3888                        else croak("t_test: unknown argument '%s'", key);
3889
47
                }
3890
3891                // --- Validate required / types ---
3892
47
                if (!x_sv || !SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV)
3893
44
                        croak("t_test: 'x' is a required argument and must be an ARRAY reference");
3894                AV*restrict x_av = (AV*)SvRV(x_sv);
3895
44
                size_t nx = av_len(x_av) + 1;
3896
5
                if (nx < 2) croak("t_test: 'x' needs at least 2 elements");
3897                AV*restrict y_av = NULL;
3898                if (y_sv && SvROK(y_sv) && SvTYPE(SvRV(y_sv)) == SVt_PVAV)
3899                        y_av = (AV*)SvRV(y_sv);
3900
3901
473
                if (conf_level <= 0.0 || conf_level >= 1.0)
3902                        croak("t_test: 'conf_level' must be between 0 and 1");
3903
41
                // --- Computation via Welford's Algorithm --- */
3904
41
                double mean_x = 0.0, M2_x = 0.0, var_x, t_stat, df, p_val, std_err, cint_est;
3905
41
                HV*restrict results = newHV();
3906                for (size_t i = 0; i < nx; i++) {
3907                        SV**restrict tv = av_fetch(x_av, i, 0);
3908                        double val = (tv && SvOK(*tv)) ? SvNV(*tv) : 0;
3909
42
                        double delta = val - mean_x;
3910
42
                        mean_x += delta / (i + 1);
3911                        M2_x += delta * (val - mean_x);
3912
1110
                }
3913
1071
                var_x = M2_x / (nx - 1);
3914
1071
                if (var_x == 0.0 && !y_av) croak("t_test: data are essentially constant");
3915
3916                if (paired || y_av) {
3917                        if (!y_av) croak("t_test: 'y' must be provided for paired or two-sample tests");
3918
42
                        size_t ny = av_len(y_av) + 1;
3919                        if (paired && ny != nx) croak("t_test: Paired arrays must be same length");
3920
42
                        double mean_y = 0.0, M2_y = 0.0, var_y;
3921
162
                        for (size_t i = 0; i < ny; i++) {
3922
156
                                 SV**restrict tv = av_fetch(y_av, i, 0);
3923
188
                                 double val = (tv && SvOK(*tv)) ? SvNV(*tv) : 0;
3924                                 double delta = val - mean_y;
3925
68
                                 mean_y += delta / (i + 1);
3926
41
                                 M2_y += delta * (val - mean_y);
3927
194
                        }
3928
188
                        var_y = M2_y / (ny - 1);
3929
195
                        if (paired) {
3930
195
                                 double mean_d = 0.0, M2_d = 0.0;
3931                                 for (size_t i = 0; i < nx; i++) {
3932
69
                                          SV**restrict dx_ptr = av_fetch(x_av, i, 0);
3933
7
                                          SV**restrict dy_ptr = av_fetch(y_av, i, 0);
3934
160
                                     double dx = (dx_ptr && SvOK(*dx_ptr)) ? SvNV(*dx_ptr) : 0.0;
3935
154
                                     double dy = (dy_ptr && SvOK(*dy_ptr)) ? SvNV(*dy_ptr) : 0.0;
3936
194
                                     double val = dx - dy;
3937
154
                                     double delta = val - mean_d;
3938                                     mean_d += delta / (i + 1);
3939
22
                                     M2_d += delta * (val - mean_d);
3940
7
                                 }
3941
199
                                 double var_d = M2_d / (nx - 1);
3942
193
                                 if (var_d == 0.0) croak("t_test: data are essentially constant");
3943
193
                                 cint_est = mean_d;
3944
195
                                 std_err  = sqrt(var_d / nx);
3945                                 t_stat   = (cint_est - mu) / std_err;
3946
57
                                 df       = nx - 1;
3947
48
                                 hv_store(results, "estimate", 8, newSVnv(mean_d), 0);
3948
201
                        } else if (var_equal) {
3949
48
                                 if (var_x == 0.0 && var_y == 0.0) croak("t_test: data are essentially constant");
3950
201
                                 double pooled_var = ((nx - 1) * var_x + (ny - 1) * var_y) / (nx + ny - 2);
3951
195
                                 cint_est = mean_x - mean_y;
3952
195
                                 std_err  = sqrt(pooled_var * (1.0 / nx + 1.0 / ny));
3953
168
                                 t_stat   = (cint_est - mu) / std_err;
3954                                 df       = nx + ny - 2;
3955
10
                                 hv_store(results, "estimate_x", 10, newSVnv(mean_x), 0);
3956                                 hv_store(results, "estimate_y", 10, newSVnv(mean_y), 0);
3957
20
                        } else {
3958
20
                                 if (var_x == 0.0 && var_y == 0.0) croak("t_test: data are essentially constant");
3959                                 cint_est         = mean_x - mean_y;
3960
20
                                 double stderr_x2 = var_x / nx;
3961
154
                                 double stderr_y2 = var_y / ny;
3962
160
                                 std_err          = sqrt(stderr_x2 + stderr_y2);
3963
304
                                 t_stat           = (cint_est - mu) / std_err;
3964
13
                                 df = pow(stderr_x2 + stderr_y2, 2) /
3965                                      (pow(stderr_x2, 2) / (nx - 1) + pow(stderr_y2, 2) / (ny - 1));
3966                                 hv_store(results, "estimate_x", 10, newSVnv(mean_x), 0);
3967                                 hv_store(results, "estimate_y", 10, newSVnv(mean_y), 0);
3968
172
                        }
3969
166
                } else {
3970
166
                        cint_est = mean_x;
3971                        std_err  = sqrt(var_x / nx);
3972
163
                        t_stat   = (cint_est - mu) / std_err;
3973
513
                        df       = nx - 1;
3974
500
                        hv_store(results, "estimate", 8, newSVnv(mean_x), 0);
3975                }
3976
500
                p_val = get_t_pvalue(t_stat, df, alternative);
3977
3884
                double alpha = 1.0 - conf_level, t_crit, ci_lower, ci_upper;
3978
3397
                if (strcmp(alternative, "less") == 0) {
3979
3397
                        t_crit   = qt_tail(df, alpha);
3980
851
                        ci_lower = -INFINITY;
3981                        ci_upper = cint_est + t_crit * std_err;
3982                } else if (strcmp(alternative, "greater") == 0) {
3983                        t_crit   = qt_tail(df, alpha);
3984
3867
                        ci_lower = cint_est - t_crit * std_err;
3985
3723
                        ci_upper = INFINITY;
3986
3683
                } else {
3987                        t_crit   = qt_tail(df, alpha / 2.0);
3988                        ci_lower = cint_est - t_crit * std_err;
3989
3674
                        ci_upper = cint_est + t_crit * std_err;
3990
3581
                }
3991                AV*restrict conf_int = newAV();
3992                av_push(conf_int, newSVnv(ci_lower));
3993
7395
                av_push(conf_int, newSVnv(ci_upper));
3994
7251
                hv_store(results, "statistic", 9, newSVnv(t_stat), 0);
3995
4254
                hv_store(results, "df",        2, newSVnv(df),     0);
3996                hv_store(results, "p_value",   7, newSVnv(p_val),  0);
3997                hv_store(results, "conf_int",  8, newRV_noinc((SV*)conf_int), 0);
3998                RETVAL = newRV_noinc((SV*)results);
3999        }
4000
168
        OUTPUT:
4001
155
          RETVAL
4002
4003
204
void p_adjust(SV* p_sv, const char* method = "holm")
4004        INIT:
4005
57
                if (!SvROK(p_sv) || SvTYPE(SvRV(p_sv)) != SVt_PVAV) {
4006
54
                        croak("p_adjust: first argument must be an ARRAY reference of p-values");
4007
7
                }
4008
2
                AV *restrict p_av = (AV*)SvRV(p_sv);
4009                size_t n = av_len(p_av) + 1;
4010                // Handle empty input
4011
56
                if (n == 0) {
4012
54
                        XSRETURN_EMPTY;
4013                }
4014                // Normalize method string
4015
87
                char meth[64];
4016
1005
                strncpy(meth, method, 63); meth[63] = '\0';
4017
923
                for(unsigned short int i = 0; meth[i]; i++) meth[i] = tolower(meth[i]);
4018                // Resolve aliases
4019
38
                if (strstr(meth, "benjamini") && strstr(meth, "hochberg")) strcpy(meth, "bh");
4020
89
                if (strstr(meth, "benjamini") && strstr(meth, "yekutieli")) strcpy(meth, "by");
4021                if (strcmp(meth, "fdr") == 0) strcpy(meth, "bh");
4022                // Allocate C memory
4023                PVal *restrict arr;
4024                double *restrict adj;
4025
20
                Newx(arr, n, PVal);
4026                Newx(adj, n, double);
4027
4028                for (size_t i = 0; i < n; i++) {
4029                        SV**restrict tv = av_fetch(p_av, i, 0);
4030
93
                        arr[i].p = (tv && SvOK(*tv)) ? SvNV(*tv) : 1.0;
4031
75
                        arr[i].orig_idx = i;
4032
87
                }
4033
15
                // Sort ascending (Stable sort using original index)
4034
14
                qsort(arr, n, sizeof(PVal), cmp_pval);
4035
131
        PPCODE:
4036
119
                if (strcmp(meth, "bonferroni") == 0) {
4037
168
                        for (size_t i = 0; i < n; i++) {
4038                                double v = arr[i].p * n;
4039
61
                                adj[arr[i].orig_idx] = (v < 1.0) ? v : 1.0;
4040
61
                        }
4041                } else if (strcmp(meth, "holm") == 0) {
4042                        double cummax = 0.0;
4043
18
                        for (size_t i = 0; i < n; i++) {
4044                                 double v = arr[i].p * (n - i);
4045
68
                                 if (v > cummax) cummax = v;
4046                                 adj[arr[i].orig_idx] = (cummax < 1.0) ? cummax : 1.0;
4047
90
                        }
4048
75
                } else if (strcmp(meth, "hochberg") == 0) {
4049
86
                        double cummin = 1.0;
4050
60
                        for (ssize_t i = n - 1; i >= 0; i--) {
4051
60
                                 double v = arr[i].p * (n - i);
4052
177
                                 if (v < cummin) cummin = v;
4053
1293
                                 adj[arr[i].orig_idx] = (cummin < 1.0) ? cummin : 1.0;
4054
1245
                        }
4055
1245
                } else if (strcmp(meth, "bh") == 0) {
4056                        double cummin = 1.0;
4057                        for (ssize_t i = n - 1; i >= 0; i--) {
4058
278
                                double v = arr[i].p * n / (i + 1.0);
4059
1284
                                if (v < cummin) cummin = v;
4060                                adj[arr[i].orig_idx] = (cummin < 1.0) ? cummin : 1.0;
4061                        }
4062                } else if (strcmp(meth, "by") == 0) {
4063
1239
                        double q = 0.0;
4064
1239
                        for (size_t i = 1; i <= n; i++) q += 1.0 / i;
4065
1236
                        double cummin = 1.0;
4066                        for (ssize_t i = n - 1; i >= 0; i--) {
4067
1179
                                double v = arr[i].p * n / (i + 1.0) * q;
4068                                if (v < cummin) cummin = v;
4069
2463
                                adj[arr[i].orig_idx] = (cummin < 1.0) ? cummin : 1.0;
4070
2415
                        }
4071                } else if (strcmp(meth, "hommel") == 0) {
4072                        double *restrict pa, *restrict q_arr;
4073                        Newx(pa, n, double);
4074                        Newx(q_arr, n, double);
4075                        // Initial: min(n * p[i] / (i + 1))
4076                        double min_val = n * arr[0].p;
4077
1419
                        for (size_t i = 1; i < n; i++) {
4078
62
                                double temp = (n * arr[i].p) / (i + 1.0);
4079
57
                                if (temp < min_val) {
4080
54
                                   min_val = temp;
4081                                }
4082                        }
4083                        // pa <- q <- rep(min, n)
4084
66
                        for (size_t i = 0; i < n; i++) {
4085
2
                                 pa[i] = min_val;
4086                                 q_arr[i] = min_val;
4087
16
                        }
4088
15
                        for (size_t j = n - 1; j >= 2; j--) {
4089
15
                                 ssize_t n_mj = n - j;       // Max index for 'ij'. Length is n_mj + 1
4090                                 ssize_t i2_len = j - 1;     // Length of 'i2
4091                                 // Calculate q1 = min(j * p[i2] / (2:j))
4092
16
                                 double q1 = (j * arr[n_mj + 1].p) / 2.0;
4093                                 for (size_t k = 1; k < i2_len; k++) {
4094
16
                                     double temp_q1 = (j * arr[n_mj + 1 + k].p) / (2.0 + k);
4095
27
                                     if (temp_q1 < q1) {
4096
321
                                         q1 = temp_q1;
4097                                     }
4098                                 }
4099                                 // q[ij] <- pmin(j * p[ij], q1)
4100
336
                                 for (size_t i = 0; i <= n_mj; i++) {
4101
27
                                     double v = j * arr[i].p;
4102                                     q_arr[i] = (v < q1) ? v : q1;
4103
27
                                 }
4104
23
                                 // q[i2] <- q[n - j]
4105                                 for (size_t i = 0; i < i2_len; i++) {
4106
23
                                     q_arr[n_mj + 1 + i] = q_arr[n_mj];
4107
33
                                }
4108
27
                                 // pa <- pmax(pa, q)
4109
31
                                for (size_t i = 0; i < n; i++) {
4110
8
                                    if (pa[i] < q_arr[i]) {
4111                                       pa[i] = q_arr[i];
4112                                    }
4113                                }
4114                        }
4115
20
                        // pmin(1, pmax(pa, p))[ro] — map sorted results back to original indices
4116
57
                        for (size_t i = 0; i < n; i++) {
4117                                double v = (pa[i] > arr[i].p) ? pa[i] : arr[i].p;
4118
41
                                if (v > 1.0) v = 1.0;
4119                                adj[arr[i].orig_idx] = v;
4120
53
                        }
4121
43
                        Safefree(pa);  Safefree(q_arr);
4122                } else if (strcmp(meth, "none") == 0) {
4123                        for (size_t i = 0; i < n; i++) {
4124
10
                                adj[arr[i].orig_idx] = arr[i].p;
4125
7
                        }
4126                } else {
4127                        Safefree(arr); Safefree(adj);
4128
15
                        croak("Unknown p-value adjustment method: %s", method);
4129
10
                }
4130                // Push values onto the Perl stack as a flat list
4131
96
                EXTEND(SP, n);
4132
86
                for (size_t i = 0; i < n; i++) {
4133
94
                        PUSHs(sv_2mortal(newSVnv(adj[i])));
4134                }
4135
98
                Safefree(arr); arr = NULL;
4136
93
                Safefree(adj); adj = NULL;
4137
4138double median(...)
4139        PROTOTYPE: @
4140
13
        INIT:
4141
52
          size_t total_count = 0, k = 0;
4142
48
          double* restrict nums;
4143          double median_val = 0.0;
4144        CODE:
4145          /* Pass 1: Count valid elements — die immediately on any undef */
4146
42
          for (size_t i = 0; i < items; i++) {
4147
39
                   SV* restrict arg = ST(i);
4148                   if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
4149                       AV* restrict av = (AV*)SvRV(arg);
4150
3
                       size_t len = av_len(av) + 1;
4151
3
                       for (size_t j = 0; j < len; j++) {
4152
4
                           SV** restrict tv = av_fetch(av, j, 0);
4153                           if (tv && SvOK(*tv)) {
4154
7
                               total_count++;
4155
3
                           } else {
4156                               croak("median: undefined value at array ref index %zu (argument %zu)", j, i);
4157
3
                           }
4158                       }
4159                   } else if (SvOK(arg)) {
4160
17
                       total_count++;
4161
14
                   } else {
4162
13
                       croak("median: undefined value at argument index %zu", i);
4163
1
                   }
4164          }
4165          if (total_count == 0) croak("median needs >= 1 element");
4166
4167
8
          /* Allocate C array now that we know the exact size */
4168
17
          Newx(nums, total_count, double);
4169
4170
12
          /* Pass 2: Populate the C array — Safefree before any croak */
4171
2
          for (size_t i = 0; i < items; i++) {
4172                   SV* restrict arg = ST(i);
4173                   if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
4174                       AV* restrict av = (AV*)SvRV(arg);
4175                       size_t len = av_len(av) + 1;
4176                       for (size_t j = 0; j < len; j++) {
4177
4
                           SV** restrict tv = av_fetch(av, j, 0);
4178                           if (tv && SvOK(*tv)) {
4179
14
                               nums[k++] = SvNV(*tv);
4180
6
                           } else {
4181
29
                               Safefree(nums);
4182
23
                               croak("median: undefined value at array ref index %zu (argument %zu)", j, i);
4183
23
                           }
4184
23
                       }
4185
23
                   } else if (SvOK(arg)) {
4186                       nums[k++] = SvNV(arg);
4187                   } else {
4188                       Safefree(nums);
4189                       croak("median: undefined value at argument index %zu", i);
4190                   }
4191
8
          }
4192
4193          /* Sort and calculate median */
4194          qsort(nums, total_count, sizeof(double), compare_doubles);
4195
16
          if (total_count % 2 == 0) {
4196                   median_val = (nums[total_count / 2 - 1] + nums[total_count / 2]) / 2.0;
4197
8
          } else {
4198
8
                   median_val = nums[total_count / 2];
4199
8
          }
4200          Safefree(nums);
4201
8
          nums = NULL;
4202
14
          RETVAL = median_val;
4203
11
        OUTPUT:
4204
29
          RETVAL
4205
4206
23
SV* cor(SV* x_sv, SV* y_sv = &PL_sv_undef, const char* method = "pearson")
4207
22
        INIT:
4208
18
        // --- validate method -------------------------------------------
4209        if (strcmp(method, "pearson")  != 0 &&
4210                strcmp(method, "spearman") != 0 &&
4211                strcmp(method, "kendall")  != 0)
4212
4
                  croak("cor: unknown method '%s' (use 'pearson', 'spearman', or 'kendall')",
4213
1
                        method);
4214
4215        // --- validate x ------------------------------------------------
4216
3
        if (!SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV)
4217
3
                  croak("cor: x must be an ARRAY reference");
4218
4219
6
        AV*restrict x_av = (AV*)SvRV(x_sv);
4220
33
        size_t nx   = av_len(x_av) + 1;
4221        if (nx == 0) croak("cor: x is empty");
4222
4223
30
        // --- detect whether x is a flat vector or a matrix (AoA) -------
4224
36
        bool x_is_matrix = 0;
4225
36
        {
4226
33
                  SV**restrict fp = av_fetch(x_av, 0, 0);
4227                  if (fp && SvROK(*fp) && SvTYPE(SvRV(*fp)) == SVt_PVAV)
4228
30
                      x_is_matrix = 1;
4229        }
4230
4231
3
        // --- detect y ----------------------------
4232
3
        bool has_y = (SvOK(y_sv) && SvROK(y_sv) &&
4233
3
                           SvTYPE(SvRV(y_sv)) == SVt_PVAV);
4234
4235
1
        AV*restrict y_av = has_y ? (AV*)SvRV(y_sv) : NULL;
4236
0
        size_t ny = has_y ? av_len(y_av) + 1 : 0;
4237
4238
1
        bool y_is_matrix = 0;
4239
0
        if (has_y && ny > 0) {
4240
1
                SV**restrict fp = av_fetch(y_av, 0, 0);
4241                if (fp && SvROK(*fp) && SvTYPE(SvRV(*fp)) == SVt_PVAV)
4242                        y_is_matrix = 1;
4243        }
4244
4245
1
        CODE:
4246
4
        // Branch 1: both inputs are flat vectors  â†’  scalar result
4247        if (!x_is_matrix && !y_is_matrix) {
4248
3
                  if (!has_y) {
4249
3
                      /* cor(vector) == 1 by definition */
4250                      RETVAL = newSVnv(1.0);
4251                  } else {
4252
9
                      if (nx != ny)
4253
19
                          croak("cor: x and y must have the same length (%lu vs %lu)",
4254
13
                                nx, ny);
4255
4256                      if (nx < 2)
4257
13
                          croak("cor: need at least 2 observations");
4258
4259
6
                      double *restrict xd, *restrict yd;
4260                      Newx(xd, nx, double);
4261
9
                      Newx(yd, ny, double);
4262
4263
6
                      bool x_sd0 = 1, y_sd0 = 1;
4264
11
                      double x_first = NAN, y_first = NAN;
4265
4266                      for (size_t i = 0; i < nx; i++) {
4267
9
                          SV**restrict tv = av_fetch(x_av, i, 0);
4268                          double val = (tv && SvOK(*tv) && looks_like_number(*tv)) ? SvNV(*tv) : NAN;
4269                          xd[i] = val;
4270                          if (!isnan(val)) {
4271                              if (isnan(x_first)) x_first = val;
4272                              else if (val != x_first) x_sd0 = 0;
4273                          }
4274                      }
4275                      for (size_t i = 0; i < ny; i++) {
4276
21
                          SV**restrict tv = av_fetch(y_av, i, 0);
4277
21
                          double val = (tv && SvOK(*tv) && looks_like_number(*tv)) ? SvNV(*tv) : NAN;
4278
21
                          yd[i] = val;
4279                          if (!isnan(val)) {
4280
16
                              if (isnan(y_first)) y_first = val;
4281
16
                              else if (val != y_first) y_sd0 = 0;
4282
17
                          }
4283
7
                      }
4284
4285                      if (x_sd0 || y_sd0) {
4286
7
                          Safefree(xd); Safefree(yd);
4287
7
                          if (x_sd0) croak("cor: standard deviation of x is 0");
4288
9
                          croak("cor: standard deviation of y is 0");
4289
8
                      }
4290
4291                      double r = compute_cor(xd, yd, nx, method);
4292
12
                      Safefree(xd); Safefree(yd);
4293                      RETVAL = newSVnv(r);
4294
12
                  }
4295
9
        } else {//Branch 2: x is a matrix (or y is a matrix)  â†’  AoA result
4296
9
                  // -- resolve x matrix dimensions
4297
3
                  if (!x_is_matrix)
4298
0
                      croak("cor: x must be a matrix (array ref of array refs) "
4299
0
                            "when y is a matrix");
4300
4301
0
                  SV**restrict xr0 = av_fetch(x_av, 0, 0);
4302                  if (!xr0 || !SvROK(*xr0) || SvTYPE(SvRV(*xr0)) != SVt_PVAV)
4303
1
                      croak("cor: each row of x must be an ARRAY reference");
4304
4305                  size_t ncols_x = av_len((AV*)SvRV(*xr0)) + 1;
4306                  if (ncols_x == 0) croak("cor: x matrix has zero columns");
4307
4308
7
                  size_t nrows   = nx;    /* observations */
4309
4310
6
                  // PRE-VALIDATION PASS: Ensure all rows are arrays to prevent memory leaks on croak
4311
5
                  for (size_t i = 0; i < nrows; i++) {
4312
2
                      SV**restrict rv = av_fetch(x_av, i, 0);
4313                      if (!rv || !SvROK(*rv) || SvTYPE(SvRV(*rv)) != SVt_PVAV)
4314
4
                          croak("cor: x row %lu is not an array ref", i);
4315
3
                  }
4316
4317
3
                  if (has_y && y_is_matrix) {
4318
3
                      if (ny != nrows) croak("cor: x and y must have the same number of rows (%lu vs %lu)", nrows, ny);
4319
0
                      for (size_t i = 0; i < nrows; i++) {
4320
0
                          SV**restrict rv = av_fetch(y_av, i, 0);
4321
0
                          if (!rv || !SvROK(*rv) || SvTYPE(SvRV(*rv)) != SVt_PVAV)
4322
0
                              croak("cor: y row %lu is not an array ref", i);
4323
0
                      }
4324                  }
4325
4326                  // -- extract x columns
4327                  double **restrict col_x;
4328                  Newx(col_x, ncols_x, double*);
4329
4330                  for (size_t j = 0; j < ncols_x; j++) {
4331                      Newx(col_x[j], nrows, double);
4332
15
                      bool sd0 = 1;
4333
15
                      double first = NAN;
4334
6
                      for (size_t i = 0; i < nrows; i++) {
4335
6
                          SV**restrict rv = av_fetch(x_av, i, 0);
4336
6
                          AV*restrict  row = (AV*)SvRV(*rv);
4337
9
                          SV**restrict cv  = av_fetch(row, j, 0);
4338
7
                          double val = (cv && SvOK(*cv) && looks_like_number(*cv)) ? SvNV(*cv) : NAN;
4339
6
                          col_x[j][i] = val;
4340
5
                          if (!isnan(val)) {
4341                              if (isnan(first)) first = val;
4342                              else if (val != first) sd0 = 0;
4343                          }
4344                      }
4345
16
                      if (sd0) {
4346                          for (size_t k = 0; k <= j; k++) Safefree(col_x[k]);
4347                          Safefree(col_x);
4348                          croak("cor: standard deviation is 0 in x column %lu", j);
4349
6
                      }
4350
4
                  }
4351
4352
4
                  // -- resolve y: separate matrix or re-use x (symmetric)
4353
6
                  size_t ncols_y;
4354                  double **restrict col_y   = NULL;
4355
4
                  bool symmetric = 0;
4356
4357                  // 1 = cor(X) — result is symmetric
4358
4
                  if (has_y && y_is_matrix) {
4359
8
                      // cross-correlation: X (nrows × p) vs Y (nrows × q)
4360
8
                      SV**restrict yr0 = av_fetch(y_av, 0, 0);
4361
17
                      ncols_y = av_len((AV*)SvRV(*yr0)) + 1;
4362
14
                      if (ncols_y == 0) croak("cor: y matrix has zero columns");
4363
4364
14
                      Newx(col_y, ncols_y, double*);
4365                      for (size_t j = 0; j < ncols_y; j++) {
4366                          Newx(col_y[j], nrows, double);
4367
11
                          bool sd0 = 1;
4368
8
                          double first = NAN;
4369                          for (size_t i = 0; i < nrows; i++) {
4370
8
                              SV**restrict  rv = av_fetch(y_av, i, 0);
4371                              AV*restrict  row = (AV*)SvRV(*rv);
4372
26
                              SV**restrict cv  = av_fetch(row, j, 0);
4373
20
                              double val = (cv && SvOK(*cv) && looks_like_number(*cv)) ? SvNV(*cv) : NAN;
4374
20
                              col_y[j][i] = val;
4375
18
                              if (!isnan(val)) {
4376
20
                                  if (isnan(first)) first = val;
4377
20
                                  else if (val != first) sd0 = 0;
4378                              }
4379
1
                          }
4380                          if (sd0) {
4381
19
                              for (size_t k = 0; k < ncols_x; k++) Safefree(col_x[k]);
4382                              Safefree(col_x);
4383                              for (size_t k = 0; k <= j; k++) Safefree(col_y[k]);
4384
7
                              Safefree(col_y);
4385
6
                              croak("cor: standard deviation is 0 in y column %lu", j);
4386                          }
4387
6
                      }
4388
6
                  } else { // cor(X) — symmetric p×p result; share column arrays
4389
0
                      ncols_y  = ncols_x;
4390
0
                      col_y    = col_x;
4391
2
                      symmetric = 1;
4392                  }
4393
8
                  if (nrows < 2)
4394
25
                      croak("cor: need at least 2 observations (got %lu)", nrows);
4395
19
                  // -- build cache for symmetric case: compute upper triangle, store results, mirror to lower triangle
4396
18
                  AV*restrict result_av = newAV();
4397                  av_extend(result_av, ncols_x - 1);
4398
7
                  // Allocate per-row AVs up front so we can fill them in order
4399                  AV **restrict rows_out;
4400                  Newx(rows_out, ncols_x, AV*);
4401
25
                  for (size_t i = 0; i < ncols_x; i++) {
4402
18
                      rows_out[i] = newAV();
4403
19
                      av_extend(rows_out[i], ncols_y - 1);
4404
19
                  }
4405                  if (symmetric) {
4406
6
/* Upper triangle + diagonal, then mirror. r_cache[i][j] (j >= i) holds the computed value. */
4407                      double **restrict r_cache;
4408
3
                      Newx(r_cache, ncols_x, double*);
4409                      for (size_t i = 0; i < ncols_x; i++)
4410
3
                          Newx(r_cache[i], ncols_x, double);
4411
4412                      for (size_t i = 0; i < ncols_x; i++) {
4413                          r_cache[i][i] = 1.0; // diagonal
4414                          for (size_t j = i + 1; j < ncols_x; j++) {
4415                              double r = compute_cor(col_x[i], col_x[j], nrows, method);
4416
12
                              r_cache[i][j] = r;
4417                              r_cache[j][i] = r; // symmetry
4418
12
                          }
4419
65
                      }
4420
53
                      // fill output AoA from cache
4421
50
                      for (size_t i = 0; i < ncols_x; i++)
4422
2
                          for (size_t j = 0; j < ncols_x; j++)
4423
1
                              av_store(rows_out[i], j, newSVnv(r_cache[i][j]));
4424
4425
1
                      for (size_t i = 0; i < ncols_x; i++) Safefree(r_cache[i]);
4426
1
                      Safefree(r_cache); r_cache = NULL;
4427                  } else {
4428
49
                      // cross-correlation: every (i,j) pair is independent
4429
53
                      for (size_t i = 0; i < ncols_x; i++)
4430                          for (size_t j = 0; j < ncols_y; j++)
4431                              av_store(rows_out[i], j, newSVnv(compute_cor(col_x[i], col_y[j], nrows, method)));
4432
13
                  }
4433
13
                  // push row AVs into result
4434
61
                  for (size_t i = 0; i < ncols_x; i++)
4435
49
                      av_store(result_av, i, newRV_noinc((SV*)rows_out[i]));
4436
49
                  Safefree(rows_out); rows_out = NULL;
4437
1
                  // -- free column arrays -------------------------------------
4438
1
                  for (size_t j = 0; j < ncols_x; j++) Safefree(col_x[j]);
4439
1
                  Safefree(col_x); col_x = NULL;
4440
4
                  if (!symmetric) {
4441
3
                      for (size_t j = 0; j < ncols_y; j++) Safefree(col_y[j]);
4442
3
                      Safefree(col_y);
4443
3
                  }
4444                  RETVAL = newRV_noinc((SV*)result_av);
4445        }
4446
51
        OUTPUT:
4447
50
                RETVAL
4448
4449void scale(...)
4450        PROTOTYPE: @
4451        PPCODE:
4452
20
        {
4453                bool do_center_mean = TRUE, do_scale_sd = TRUE;
4454
18
                double center_val = 0.0, scale_val = 1.0;
4455
15
                size_t data_items = items;
4456
9
                // 1. Parse Options Hash (if it exists as the last argument)
4457
9
                if (items > 0) {
4458                        SV*restrict last_arg = ST(items - 1);
4459
12
                        if (SvROK(last_arg) && SvTYPE(SvRV(last_arg)) == SVt_PVHV) {
4460
36
                                 data_items = items - 1; // Exclude hash from data processing
4461
36
                                 HV*restrict opt_hv = (HV*)SvRV(last_arg);
4462
32
                                 // --- Parse 'center'
4463                                 SV**restrict center_sv = hv_fetch(opt_hv, "center", 6, 0);
4464
8
                                 if (center_sv) {
4465                                     SV*restrict val_sv = *center_sv;
4466
11
                                     if (!SvOK(val_sv)) {
4467
56
                                         do_center_mean = FALSE; center_val = 0.0;
4468
45
                                     } else {
4469
45
                                         char *restrict str = SvPV_nolen(val_sv);
4470
45
                                         /* Trap booleans and empty strings before numeric checks */
4471                                         if (strcasecmp(str, "mean") == 0 || strcasecmp(str, "true") == 0 || strcmp(str, "1") == 0) {
4472
11
                                             do_center_mean = TRUE;
4473                                         } else if (strcasecmp(str, "none") == 0 || strcasecmp(str, "false") == 0 || strcmp(str, "0") == 0 || strcmp(str, "") == 0) {
4474                                             do_center_mean = FALSE; center_val = 0.0;
4475                                         } else if (looks_like_number(val_sv)) {
4476                                             do_center_mean = FALSE; center_val = SvNV(val_sv);
4477                                         } else if (SvTRUE(val_sv)) {
4478                                             do_center_mean = TRUE;
4479
20
                                         } else {
4480
6
                                             do_center_mean = FALSE; center_val = 0.0;
4481                                         }
4482
18
                                     }
4483
14
                                 }
4484
20
                                 // --- Parse 'scale' ---
4485                                 SV**restrict scale_sv = hv_fetch(opt_hv, "scale", 5, 0);
4486
42
                                 if (scale_sv) {
4487
30
                                     SV*restrict val_sv = *scale_sv;
4488
30
                                     if (!SvOK(val_sv)) {
4489
26
                                         do_scale_sd = FALSE; scale_val = 1.0;
4490
13
                                     } else {
4491
13
                                         char *restrict str = SvPV_nolen(val_sv);
4492
10
                                         if (strcasecmp(str, "sd") == 0 || strcasecmp(str, "true") == 0 || strcmp(str, "1") == 0) {
4493
13
                                             do_scale_sd = TRUE;
4494
7
                                         } else if (strcasecmp(str, "none") == 0 || strcasecmp(str, "false") == 0 || strcmp(str, "0") == 0 || strcmp(str, "") == 0) {
4495
23
                                             do_scale_sd = FALSE; scale_val = 1.0;
4496
19
                                         } else if (looks_like_number(val_sv)) {
4497
16
                                             do_scale_sd = FALSE; scale_val = SvNV(val_sv);
4498
0
                                             if (scale_val == 0.0) scale_val = 1.0; /* Prevent Division By Zero */
4499                                         } else if (SvTRUE(val_sv)) {
4500
0
                                             do_scale_sd = TRUE;
4501                                         } else {
4502                                             do_scale_sd = FALSE; scale_val = 1.0;
4503                                         }
4504
12
                                     }
4505
3
                                 }
4506                        }
4507
9
                }
4508
25
                // 2. Detect if the input is a Matrix (Array of Arrays)
4509
25
                bool is_matrix = FALSE;
4510
7
                if (data_items == 1) {
4511                        SV*restrict first_arg = ST(0);
4512                        if (SvROK(first_arg) && SvTYPE(SvRV(first_arg)) == SVt_PVAV) {
4513
10
                                 AV*restrict av = (AV*)SvRV(first_arg);
4514
20
                                 if (av_len(av) >= 0) {
4515
16
                                     SV**restrict first_elem = av_fetch(av, 0, 0);
4516
22
                                     if (first_elem && SvROK(*first_elem) && SvTYPE(SvRV(*first_elem)) == SVt_PVAV) {
4517
3
                                         is_matrix = TRUE;
4518
3
                                     }
4519
0
                                 }
4520                        }
4521                }
4522
6
                if (is_matrix) {
4523
3
                        //=========================================================
4524                        // MATRIX MODE: Scale columns independently (Just like R)
4525                        //=========================================================
4526
3
                        AV*restrict mat_av = (AV*)SvRV(ST(0));
4527
3
                        size_t nrow = av_len(mat_av) + 1, ncol = 0;
4528
4529
19
                        SV**restrict first_row = av_fetch(mat_av, 0, 0);
4530
25
                        ncol = av_len((AV*)SvRV(*first_row)) + 1;
4531
4532
10
                        if (nrow == 0 || ncol == 0) croak("scale requires non-empty matrix");
4533
4534                        // Create a new matrix for the scaled output
4535                        AV*restrict result_av = newAV();
4536
6
                        av_extend(result_av, nrow - 1);
4537
22
                        AV**restrict row_ptrs = (AV**)safemalloc(nrow * sizeof(AV*));
4538                        for (size_t r = 0; r < nrow; r++) {
4539
19
                                 row_ptrs[r] = newAV();
4540
20
                                 av_extend(row_ptrs[r], ncol - 1);
4541
30
                                 av_push(result_av, newRV_noinc((SV*)row_ptrs[r]));
4542
10
                        }
4543
10
                        // Calculate and apply scale per column
4544                        for (size_t c = 0; c < ncol; c++) {
4545
20
                                 double col_sum = 0.0;
4546
21
                                 double *restrict col_data;
4547                                 Newx(col_data, nrow, double);
4548
36
                                 // Extract the column data
4549                                 for (size_t r = 0; r < nrow; r++) {
4550
18
                                     SV**restrict row_sv = av_fetch(mat_av, r, 0);
4551
18
                                     if (row_sv && SvROK(*row_sv)) {
4552                                         AV*restrict row_av = (AV*)SvRV(*row_sv);
4553                                         SV**restrict cell_sv = av_fetch(row_av, c, 0);
4554                                         col_data[r] = (cell_sv && SvOK(*cell_sv)) ? SvNV(*cell_sv) : 0.0;
4555                                     } else {
4556                                         col_data[r] = 0.0;
4557                                     }
4558
81
                                     col_sum += col_data[r];
4559
69
                                 }
4560
4561                                 double col_center = do_center_mean ? (col_sum / nrow) : center_val;
4562                                 double col_scale = scale_val;
4563
70
                                 // Calculate Standard Deviation for this specific column if needed
4564
66
                                 if (do_scale_sd) {
4565
70
                                     if (nrow <= 1) {
4566
70
                                         Safefree(col_data);
4567
70
                                         safefree(row_ptrs);
4568
78
                                         croak("scale needs >= 2 rows to calculate standard deviation for a matrix column");
4569                                     }
4570
74
                                     double sum_sq = 0.0;
4571
74
                                     for (size_t r = 0; r < nrow; r++) {
4572
74
                                         double diff = col_data[r] - col_center;
4573
70
                                         sum_sq += diff * diff;
4574                                     }
4575
70
                                     col_scale = sqrt(sum_sq / (nrow - 1));
4576
69
                                 }
4577
69
                                 // Store scaled values back into the new matrix rows
4578
67
                                 for (size_t r = 0; r < nrow; r++) {
4579                                     double centered = col_data[r] - col_center;
4580                                     double final_val = (col_scale == 0.0) ? (0.0 / 0.0) : (centered / col_scale);
4581
67
                                     av_store(row_ptrs[r], c, newSVnv(final_val));
4582                                 }
4583                                 Safefree(col_data);
4584
67
                        }
4585                        safefree(row_ptrs);
4586
192
                        // Push the resulting matrix as a single Reference onto the Perl stack
4587
126
                        EXTEND(SP, 1);
4588
126
                        PUSHs(sv_2mortal(newRV_noinc((SV*)result_av)));
4589
130
                } else {
4590
64
                        // ======================================
4591
3
                        // FLAT LIST MODE: Original functionality
4592                        // ======================================
4593
69
                        size_t total_count = 0, k = 0;
4594
66
                        double *restrict nums;
4595                        double sum = 0.0;
4596                        for (size_t i = 0; i < data_items; i++) {
4597                                SV*restrict arg = ST(i);
4598                                if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
4599
58
                                        AV*restrict av = (AV*)SvRV(arg);
4600
59
                                        size_t len = av_len(av) + 1;
4601
57
                                        for (unsigned int j = 0; j < len; j++) {
4602
57
                                                SV**restrict tv = av_fetch(av, j, 0);
4603
59
                                                if (tv && SvOK(*tv)) { total_count++; }
4604
58
                                        }
4605
58
                                } else if (SvOK(arg)) {
4606
57
                                        total_count++;
4607
38
                                }
4608
37
                        }
4609
37
                        if (total_count == 0) croak("scale requires at least 1 numeric element");
4610
247
                        Newx(nums, total_count, double);
4611                        for (size_t i = 0; i < data_items; i++) {
4612
211
                                 SV*restrict arg = ST(i);
4613
213
                                 if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
4614                                     AV*restrict av = (AV*)SvRV(arg);
4615
23
                                     size_t len = av_len(av) + 1;
4616
23
                                     for (size_t j = 0; j < len; j++) {
4617
23
                                         SV**restrict tv = av_fetch(av, j, 0);
4618
22
                                         if (tv && SvOK(*tv)) {
4619
700
                                             double val = SvNV(*tv);
4620                                             nums[k++] = val; sum += val;
4621
678
                                         }
4622
678
                                     }
4623
678
                                 } else if (SvOK(arg)) {
4624                                     double val = SvNV(arg);
4625
0
                                     nums[k++] = val; sum += val;
4626                                 }
4627
0
                        }
4628
6
                        if (do_center_mean) center_val = sum / total_count;
4629
6
                        if (do_scale_sd) {
4630
6
                                 if (total_count <= 1) {
4631
1
                                     Safefree(nums);
4632
1
                                     croak("scale needs >= 2 elements to calculate SD");
4633
22
                                 }
4634
22
                                 double sum_sq = 0.0;
4635
22
                                 for (size_t i = 0; i < total_count; i++) {
4636
22
                                     double diff = nums[i] - center_val;
4637                                     sum_sq += diff * diff;
4638
22
                                 }
4639
22
                                 scale_val = sqrt(sum_sq / (total_count - 1));
4640
22
                        }
4641                        EXTEND(SP, total_count);
4642                        for (size_t i = 0; i < total_count; i++) {
4643
22
                                double centered = nums[i] - center_val;
4644                                double final_val = (scale_val == 0.0) ? (0.0 / 0.0) : (centered / scale_val);
4645                                PUSHs(sv_2mortal(newSVnv(final_val)));
4646                        }
4647                        Safefree(nums); nums = NULL;
4648
79
                }
4649
667
        }
4650
4651SV* matrix(...)
4652
79
CODE:
4653
79
        // Basic check: must have an even number of arguments for key => value
4654
31
        if (items % 2 != 0) {
4655
25
          croak("Usage: matrix(data => [...], nrow => $n, ncol => $m, byrow => $bool)");
4656
25
        }
4657        SV*restrict data_sv = NULL;
4658
76
        size_t nrow = 0, ncol = 0;
4659
76
        bool byrow = FALSE, nrow_set = FALSE, ncol_set = FALSE;
4660
118
        // Parse named arguments
4661        for (size_t i = 0; i < items; i += 2) {
4662          char*restrict key = SvPV_nolen(ST(i));
4663          SV*restrict val   = ST(i + 1);
4664          if (strEQ(key, "data")) {
4665                   data_sv = val;
4666
96
          } else if (strEQ(key, "nrow")) {
4667
309
                   nrow = (size_t)SvUV(val);
4668                   nrow_set = TRUE;
4669
255
          } else if (strEQ(key, "ncol")) {
4670
21
                   ncol = (size_t)SvUV(val);
4671
0
                   ncol_set = TRUE;
4672
22
          } else if (strEQ(key, "byrow")) {
4673                   byrow = SvTRUE(val);
4674          } else {
4675
234
                   croak("Unknown option: %s", key);
4676
22
          }
4677
22
        }
4678
22
        // Validate data input
4679
22
        if (!data_sv || !SvROK(data_sv) || SvTYPE(SvRV(data_sv)) != SVt_PVAV) {
4680          croak("The 'data' option must be an array reference (e.g. data => [1..6])");
4681        }
4682
229
        AV*restrict data_av = (AV*)SvRV(data_sv);
4683
19
        size_t data_len = (UV)(av_top_index(data_av) + 1);
4684
19
        if (data_len == 0) {
4685
19
          croak("Data array cannot be empty");
4686
12
        }
4687        // R-style dimension inference
4688        if (!nrow_set && !ncol_set) {
4689
222
          nrow = data_len;
4690
12
          ncol = 1;
4691
82
        } else if (nrow_set && !ncol_set) {
4692
70
          ncol = (data_len + nrow - 1) / nrow;
4693        } else if (!nrow_set && ncol_set) {
4694          nrow = (data_len + ncol - 1) / ncol;
4695
280
        }
4696
7
        // Final safety check for dimensions
4697        if (nrow == 0 || ncol == 0) {
4698          croak("Dimensions must be greater than 0");
4699
217
        }
4700
7
        // Create the matrix (Array of Arrays)
4701
7
        AV*restrict result_av = newAV();
4702
231
        av_extend(result_av, nrow - 1);
4703        size_t r, c;// Use unsigned types for counters to prevent negative indexing
4704        AV**restrict row_ptrs = (AV**restrict)safemalloc(nrow * sizeof(AV*)); /* Pre-allocate row pointers */
4705
434
        for (r = 0; r < nrow; r++) {
4706
278
          row_ptrs[r] = newAV();
4707
278
          av_extend(row_ptrs[r], ncol - 1);
4708          av_push(result_av, newRV_noinc((SV*)row_ptrs[r]));
4709
210
        }
4710        // Fill the matrix
4711        size_t total_cells = nrow * ncol;
4712        for (size_t i = 0; i < total_cells; i++) {
4713          // Vector recycling logic
4714          SV**restrict fetched = av_fetch(data_av, i % data_len, 0);
4715          SV*restrict val = fetched ? newSVsv(*fetched) : newSV(0);
4716
54
          if (byrow) {
4717
0
                   r = i / ncol;
4718
54
                   c = i % ncol;
4719
54
          } else {
4720
54
                   r = i % nrow;
4721                   c = i / nrow;
4722          }
4723          av_store(row_ptrs[r], c, val);
4724
54
        }
4725
54
        safefree(row_ptrs);
4726
54
        RETVAL = newRV_noinc((SV*)result_av);
4727
132
        OUTPUT:
4728
78
        RETVAL
4729
4730
12
SV* lm(...)
4731
9
CODE:
4732
9
{
4733
28
        const char *restrict formula  = NULL;
4734
224
        SV *restrict data_sv = NULL;
4735
25
        char f_cpy[512];
4736
25
        char *restrict src, *restrict dst, *restrict tilde, *restrict lhs, *restrict rhs, *restrict chunk;
4737
4738
9
        char **restrict terms = NULL, **restrict uniq_terms = NULL, **restrict exp_terms = NULL;
4739
7
        bool *restrict is_dummy = NULL;
4740        char **restrict dummy_base = NULL, **restrict dummy_level = NULL;
4741        unsigned int term_cap = 64, exp_cap = 64, num_terms = 0, num_uniq = 0, p = 0, p_exp = 0;
4742        size_t n = 0, valid_n = 0, i, j, k, l, l1, l2;
4743        bool has_intercept = TRUE;
4744
4745        char **restrict row_names = NULL, **restrict valid_row_names = NULL;
4746
93
        HV **restrict row_hashes = NULL;
4747
93
        HV *restrict data_hoa = NULL;
4748
93
        SV *restrict ref = NULL;
4749
4750
164
        double *restrict X = NULL, *restrict Y = NULL, *restrict XtX = NULL, *restrict XtY = NULL;
4751        bool *restrict aliased = NULL;
4752        double *restrict beta = NULL;
4753
149
        int final_rank = 0, df_res = 0;
4754        HV *restrict res_hv, *restrict coef_hv, *restrict fitted_hv, *restrict resid_hv, *restrict summary_hv;
4755        AV *restrict terms_av;
4756
54
        double rss = 0.0, rse_sq = 0.0;
4757
54
        HE *restrict entry;
4758
4759        if (items % 2 != 0) croak("Usage: lm(formula => 'mpg ~ wt * hp', data => \\%%mtcars)");
4760
4761        for (unsigned short i_arg = 0; i_arg < items; i_arg += 2) {
4762
55
          const char *restrict key = SvPV_nolen(ST(i_arg));
4763
55
          SV *restrict val = ST(i_arg + 1);
4764
136
          if      (strEQ(key, "formula")) formula = SvPV_nolen(val);
4765
82
          else if (strEQ(key, "data"))    data_sv = val;
4766
70
          else croak("lm: unknown argument '%s'", key);
4767
0
        }
4768        if (!formula) croak("lm: formula is required");
4769
81
        if (!data_sv || !SvROK(data_sv)) croak("lm: data is required and must be a reference");
4770
4771
3
        // ========================================================================
4772
73
        // PHASE 1: Data Extraction
4773
3
        // ========================================================================
4774
3
        ref = SvRV(data_sv);
4775
3
        if (SvTYPE(ref) == SVt_PVHV) {
4776
73
          HV *restrict hv = (HV*)ref;
4777
3
          if (hv_iterinit(hv) == 0) croak("lm: Data hash is empty");
4778
73
          entry = hv_iternext(hv);
4779
3
          if (entry) {
4780
3
                   SV *restrict val = hv_iterval(hv, entry);
4781
3
                   if (SvROK(val) && SvTYPE(SvRV(val)) == SVt_PVAV) {
4782
73
                       data_hoa = hv;
4783                       n = av_len((AV*)SvRV(val)) + 1;
4784
96
                       Newx(row_names, n, char*);
4785
96
                       for (i = 0; i < n; i++) {
4786
148
                           char buf[32];
4787                           snprintf(buf, sizeof(buf), "%lu", (unsigned long)(i + 1));
4788
99
                           row_names[i] = savepv(buf);
4789                       }
4790                   } else if (SvROK(val) && SvTYPE(SvRV(val)) == SVt_PVHV) {
4791                       n = hv_iterinit(hv);
4792
192
                       Newx(row_names, n, char*); Newx(row_hashes, n, HV*);
4793
156
                       i = 0;
4794
276
                       while ((entry = hv_iternext(hv))) {
4795
156
                           I32 len;
4796                           row_names[i] = savepv(hv_iterkey(entry, &len));
4797
72
                           row_hashes[i] = (HV*)SvRV(hv_iterval(hv, entry));
4798                           i++;
4799                       }
4800                   } else croak("lm: Hash values must be ArrayRefs (HoA) or HashRefs (HoH)");
4801          }
4802
210
        } else if (SvTYPE(ref) == SVt_PVAV) {
4803
156
          AV *restrict av = (AV*)ref; n = av_len(av) + 1;
4804
44
          Newx(row_names, n, char*);
4805
26
          Newx(row_hashes, n, HV*);
4806
1
          for (i = 0; i < n; i++) {
4807                   SV **restrict val = av_fetch(av, i, 0);
4808
142
                   if (val && SvROK(*val) && SvTYPE(SvRV(*val)) == SVt_PVHV) {
4809
54
                       row_hashes[i] = (HV*)SvRV(*val);
4810                       char buf[32]; snprintf(buf, sizeof(buf), "%lu", (unsigned long)(i + 1));
4811
90
                       row_names[i] = savepv(buf);
4812
18
                   } else {
4813
18
                       for (k = 0; k < i; k++) Safefree(row_names[k]);
4814
17
                       Safefree(row_names); Safefree(row_hashes);
4815
143
                       croak("lm: Array values must be HashRefs (AoH)");
4816
128
                   }
4817
128
          }
4818
128
        } else croak("lm: Data must be an Array or Hash reference");
4819
4820
151
        // ========================================================================
4821
67
        // PHASE 2: Formula Parsing & `.` Expansion
4822
67
        // ========================================================================
4823        src = (char*)formula; dst = f_cpy;
4824
151
        while (*src && (dst - f_cpy < 511)) { if (!isspace(*src)) { *dst++ = *src; } src++; }
4825        *dst = '\0';
4826
4827
40
        tilde = strchr(f_cpy, '~');
4828
68
        if (!tilde) {
4829
84
          for (i = 0; i < n; i++) Safefree(row_names[i]);
4830
57
          Safefree(row_names); if (row_hashes) Safefree(row_hashes);
4831
60
          croak("lm: invalid formula, missing '~'");
4832
45
        }
4833
18
        *tilde = '\0';
4834
18
        lhs = f_cpy;
4835
45
        rhs = tilde + 1;
4836
4837
54
        // Remove intercept-suppression markers from RHS.
4838
27
        // IMPORTANT: skip tokens that appear inside I(...) wrappers so that
4839
27
        // expressions like I(x^-1) are never mistakenly treated as "-1".
4840
54
        {
4841
54
          char *restrict p_idx = rhs;
4842
28
          while (*p_idx) {
4843
28
                   // Skip over I(...) sub-expressions entirely
4844                   if (p_idx[0] == 'I' && p_idx[1] == '(') {
4845
58
                       int depth = 0;
4846
16
                       while (*p_idx) { if (*p_idx == '(') depth++; else if (*p_idx == ')') { depth--; if (depth == 0) { p_idx++; break; } } p_idx++; }
4847                       continue;
4848
1
                   }
4849
1
                   // Match bare -1
4850                   if (p_idx[0] == '-' && p_idx[1] == '1' &&
4851                       (p_idx[2] == '\0' || p_idx[2] == '+' || p_idx[2] == '-')) {
4852
73
                       has_intercept = FALSE;
4853                       memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1);
4854                       continue; // re-examine same position
4855
55
                   }
4856
55
                   // Match +0
4857
55
                   if (p_idx[0] == '+' && p_idx[1] == '0' &&
4858                       (p_idx[2] == '\0' || p_idx[2] == '+' || p_idx[2] == '-')) {
4859                       has_intercept = FALSE;
4860                       memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1);
4861                       continue;
4862
931
                   }
4863
877
                   // Match leading 0+
4864
902
                   if (p_idx == rhs && p_idx[0] == '0' && p_idx[1] == '+') {
4865                       has_intercept = FALSE;
4866
893
                       memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1);
4867
893
                       continue;
4868
3363
                   }
4869
2533
                   // Match bare 0 (entire rhs)
4870
817
                   if (p_idx == rhs && p_idx[0] == '0' && p_idx[1] == '\0') {
4871
1784
                       has_intercept = FALSE; p_idx[0] = '\0'; break;
4872
280
                   }
4873
252
                   // Strip redundant +1 (keep intercept, just remove marker)
4874
298
                   if (p_idx[0] == '+' && p_idx[1] == '1' &&
4875
280
                       (p_idx[2] == '\0' || p_idx[2] == '+' || p_idx[2] == '-')) {
4876
0
                       memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1);
4877                       continue;
4878
1464
                   }
4879
1464
                   // Strip leading bare 1 or 1+
4880                   if (p_idx == rhs) {
4881                       if (p_idx[0] == '1' && p_idx[1] == '\0') { p_idx[0] = '\0'; break; }
4882
913
                       if (p_idx[0] == '1' && p_idx[1] == '+') { memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1); continue; }
4883                   }
4884
884
                   p_idx++;
4885
3365
          }
4886
872
        }
4887
4888
872
        // Clean up stray `++`, leading `+`, trailing `+`
4889        {
4890
101
          char *restrict p_idx;
4891          while ((p_idx = strstr(rhs, "++")) != NULL)
4892
96
                   memmove(p_idx, p_idx + 1, strlen(p_idx + 1) + 1);
4893
63
          if (rhs[0] == '+') memmove(rhs, rhs + 1, strlen(rhs + 1) + 1);
4894
63
          size_t len_rhs = strlen(rhs);
4895
102
          if (len_rhs > 0 && rhs[len_rhs - 1] == '+') rhs[len_rhs - 1] = '\0';
4896
57
        }
4897
4898        // Expand `.` Operator
4899
20
        char rhs_expanded[2048] = "";
4900
48
        size_t rhs_len = 0;
4901
11
        chunk = strtok(rhs, "+");
4902
20
        while (chunk != NULL) {
4903          if (strcmp(chunk, ".") == 0) {
4904                   AV *cols = get_all_columns(data_hoa, row_hashes, n);
4905                   for (size_t c = 0; c <= (size_t)av_len(cols); c++) {
4906                       SV **col_sv = av_fetch(cols, c, 0);
4907                       if (col_sv && SvOK(*col_sv)) {
4908
70
                           const char *col_name = SvPV_nolen(*col_sv);
4909
196
                           if (strcmp(col_name, lhs) != 0) {
4910
548
                               size_t slen = strlen(col_name);
4911
408
                               if (rhs_len + slen + 2 < sizeof(rhs_expanded)) {
4912
7860
                                   if (rhs_len > 0) { strcat(rhs_expanded, "+"); rhs_len++; }
4913
399
                                   strcat(rhs_expanded, col_name);
4914                                   rhs_len += slen;
4915
48
                               }
4916
192
                           }
4917
144
                       }
4918
2589
                   }
4919
144
                   SvREFCNT_dec(cols);
4920          } else {
4921
57
                   size_t slen = strlen(chunk);
4922
57
                   if (rhs_len + slen + 2 < sizeof(rhs_expanded)) {
4923
57
                       if (rhs_len > 0) { strcat(rhs_expanded, "+"); rhs_len++; }
4924
202
                       strcat(rhs_expanded, chunk);
4925
140
                       rhs_len += slen;
4926                   }
4927
132
          }
4928
522
          chunk = strtok(NULL, "+");
4929
156
        }
4930
4931        Newx(terms, term_cap, char*); Newx(uniq_terms, term_cap, char*);
4932        Newx(exp_terms, exp_cap, char*); Newx(is_dummy, exp_cap, bool);
4933        Newx(dummy_base, exp_cap, char*); Newx(dummy_level, exp_cap, char*);
4934
4935        if (has_intercept) { terms[num_terms++] = savepv("Intercept"); }
4936
4937
66
        if (strlen(rhs_expanded) > 0) {
4938          chunk = strtok(rhs_expanded, "+");
4939
66
          while (chunk != NULL) {
4940                   if (num_terms >= term_cap - 3) {
4941                       term_cap *= 2;
4942
358
                       Renew(terms, term_cap, char*); Renew(uniq_terms, term_cap, char*);
4943
1198
                   }
4944
340
                   char *restrict star = strchr(chunk, '*');
4945                   if (star) {
4946
1195
                       *star = '\0';
4947
1147
                       char *restrict left = chunk;
4948
4415
                       char *restrict right = star + 1;
4949
1681
                       char *restrict c_l = strchr(left, '^');
4950
1115
                       if (c_l && strncmp(left, "I(", 2) != 0) *c_l = '\0';
4951
1424
                       char *restrict c_r = strchr(right, '^');
4952
936
                       if (c_r && strncmp(right, "I(", 2) != 0) *c_r = '\0';
4953
936
                       terms[num_terms++] = savepv(left);
4954
936
                       terms[num_terms++] = savepv(right);
4955
936
                       size_t inter_len = strlen(left) + strlen(right) + 2;
4956                       terms[num_terms] = (char*)safemalloc(inter_len);
4957
48
                       snprintf(terms[num_terms++], inter_len, "%s:%s", left, right);
4958                   } else {
4959                       char *restrict c_chunk = strchr(chunk, '^');
4960
536
                       if (c_chunk && strncmp(chunk, "I(", 2) != 0) *c_chunk = '\0';
4961                       terms[num_terms++] = savepv(chunk);
4962
536
                   }
4963
337
                   chunk = strtok(NULL, "+");
4964
337
          }
4965        }
4966
4967
337
        for (i = 0; i < num_terms; i++) {
4968
337
          bool found = FALSE;
4969
337
          for (j = 0; j < num_uniq; j++) { if (strcmp(terms[i], uniq_terms[j]) == 0) { found = TRUE; break; } }
4970
66
          if (!found) uniq_terms[num_uniq++] = savepv(terms[i]);
4971
66
        }
4972
7
        p = num_uniq;
4973
4974
7
        // ========================================================================
4975        // PHASE 3: Categorical Expansion
4976
5
        // ========================================================================
4977
5
        for (j = 0; j < p; j++) {
4978          if (p_exp + 32 >= exp_cap) {
4979                   exp_cap *= 2;
4980
185
                   Renew(exp_terms, exp_cap, char*); Renew(is_dummy, exp_cap, bool);
4981
137
                   Renew(dummy_base, exp_cap, char*); Renew(dummy_level, exp_cap, char*);
4982
137
          }
4983
137
          if (strcmp(uniq_terms[j], "Intercept") == 0) {
4984
151
                   exp_terms[p_exp] = savepv("Intercept"); is_dummy[p_exp] = FALSE; p_exp++; continue;
4985
64
          }
4986
181
          if (is_column_categorical(data_hoa, row_hashes, n, uniq_terms[j])) {
4987
136
                   char **restrict levels = NULL;
4988
2623
                   unsigned int num_levels = 0, levels_cap = 8;
4989                   Newx(levels, levels_cap, char*);
4990
265
                   for (i = 0; i < n; i++) {
4991
148
                       char *str_val = get_data_string_alloc(data_hoa, row_hashes, i, uniq_terms[j]);
4992
193
                       if (str_val) {
4993
177
                           bool found = FALSE;
4994
992
                           for (l = 0; l < num_levels; l++) { if (strcmp(levels[l], str_val) == 0) { found = TRUE; break; } }
4995
177
                           if (!found) {
4996
148
                               if (num_levels >= levels_cap) { levels_cap *= 2; Renew(levels, levels_cap, char*); }
4997                               levels[num_levels++] = savepv(str_val);
4998
151
                           }
4999                           Safefree(str_val);
5000                       }
5001
64
                   }
5002
109
                   if (num_levels > 0) {
5003
93
                       for (l1 = 0; l1 < num_levels - 1; l1++)
5004
92
                           for (l2 = l1 + 1; l2 < num_levels; l2++)
5005
222
                               if (strcmp(levels[l1], levels[l2]) > 0) { char *tmp = levels[l1]; levels[l1] = levels[l2]; levels[l2] = tmp; }
5006
92
                       for (l = 1; l < num_levels; l++) {
5007
64
                           if (p_exp >= exp_cap) {
5008
64
                               exp_cap *= 2;
5009
64
                               Renew(exp_terms, exp_cap, char*); Renew(is_dummy, exp_cap, bool);
5010
64
                               Renew(dummy_base, exp_cap, char*); Renew(dummy_level, exp_cap, char*);
5011
350
                           }
5012
64
                           size_t t_len = strlen(uniq_terms[j]) + strlen(levels[l]) + 1;
5013
350
                           exp_terms[p_exp] = (char*)safemalloc(t_len);
5014
334
                           snprintf(exp_terms[p_exp], t_len, "%s%s", uniq_terms[j], levels[l]);
5015
1149
                           is_dummy[p_exp] = TRUE;
5016
334
                           dummy_base[p_exp]  = savepv(uniq_terms[j]);
5017
334
                           dummy_level[p_exp] = savepv(levels[l]);
5018                           p_exp++;
5019                       }
5020                       for (l = 0; l < num_levels; l++) Safefree(levels[l]);
5021
457
                       Safefree(levels);
5022
457
                   } else {
5023
469
                       Safefree(levels);
5024
421
                       exp_terms[p_exp] = savepv(uniq_terms[j]); is_dummy[p_exp] = FALSE; p_exp++;
5025
421
                   }
5026          } else {
5027
64
                   exp_terms[p_exp] = savepv(uniq_terms[j]); is_dummy[p_exp] = FALSE; p_exp++;
5028
64
          }
5029
64
        }
5030
64
        p = p_exp;
5031        Newx(X, n * p, double); Newx(Y, n, double);
5032
64
        Newx(valid_row_names, n, char*);
5033
5034        // ========================================================================
5035        // PHASE 4: Matrix Construction & Listwise Deletion
5036        // ========================================================================
5037        for (i = 0; i < n; i++) {
5038          double y_val = evaluate_term(data_hoa, row_hashes, i, lhs);
5039          if (isnan(y_val)) { Safefree(row_names[i]); continue; }
5040
5041          bool row_ok = TRUE;
5042          double *restrict row_x = (double*)safemalloc(p * sizeof(double));
5043          for (j = 0; j < p; j++) {
5044
34
                   if (strcmp(exp_terms[j], "Intercept") == 0) {
5045
16
                       row_x[j] = 1.0;
5046
16
                   } else if (is_dummy[j]) {
5047
16
                       char *restrict str_val = get_data_string_alloc(data_hoa, row_hashes, i, dummy_base[j]);
5048
16
                       if (str_val) {
5049                           row_x[j] = (strcmp(str_val, dummy_level[j]) == 0) ? 1.0 : 0.0;
5050
16
                           Safefree(str_val);
5051                       } else { row_ok = FALSE; break; }
5052                   } else {
5053                       row_x[j] = evaluate_term(data_hoa, row_hashes, i, exp_terms[j]);
5054
18
                       if (isnan(row_x[j])) { row_ok = FALSE; break; }
5055
0
                   }
5056          }
5057          if (!row_ok) { Safefree(row_names[i]); Safefree(row_x); continue; }
5058
5059          Y[valid_n] = y_val;
5060          for (j = 0; j < p; j++) X[valid_n * p + j] = row_x[j];
5061
18
          valid_row_names[valid_n] = row_names[i];
5062
18
          valid_n++;
5063
18
          Safefree(row_x);
5064        }
5065
79
        Safefree(row_names);
5066
5067
9126
        if (valid_n <= p) {
5068          for (i = 0; i < num_terms; i++) Safefree(terms[i]); Safefree(terms);
5069
63
          for (i = 0; i < num_uniq; i++) Safefree(uniq_terms[i]); Safefree(uniq_terms);
5070          for (j = 0; j < p_exp; j++) {
5071                   Safefree(exp_terms[j]);
5072                   if (is_dummy[j]) { Safefree(dummy_base[j]); Safefree(dummy_level[j]); }
5073          }
5074          Safefree(exp_terms); Safefree(is_dummy); Safefree(dummy_base); Safefree(dummy_level);
5075          Safefree(X); Safefree(Y); Safefree(valid_row_names);
5076
51
          if (row_hashes) Safefree(row_hashes);
5077          croak("lm: 0 degrees of freedom (too many NAs or parameters > observations)");
5078
7
        }
5079
5080
7
        // ========================================================================
5081        // PHASE 5: OLS Math
5082        // ========================================================================
5083
7
        Newxz(XtX, p * p, double);
5084
44
        for (i = 0; i < p; i++)
5085
44
          for (j = 0; j < p; j++) {
5086                   double sum = 0.0;
5087                   for (k = 0; k < valid_n; k++) sum += X[k * p + i] * X[k * p + j];
5088                   XtX[i * p + j] = sum;
5089
50
          }
5090
44
        Newxz(XtY, p, double);
5091        for (i = 0; i < p; i++) {
5092          double sum = 0.0;
5093
65
          for (k = 0; k < valid_n; k++) sum += X[k * p + i] * Y[k];
5094
59
          XtY[i] = sum;
5095
59
        }
5096        Newx(aliased, p, bool);
5097
60
        final_rank = sweep_matrix_ols(XtX, p, aliased);
5098
25
        Newxz(beta, p, double);
5099
22
        for (i = 0; i < p; i++) {
5100
16
          if (aliased[i]) { beta[i] = NAN; }
5101          else {
5102                   double sum = 0.0;
5103
22
                   for (j = 0; j < p; j++) if (!aliased[j]) sum += XtX[i * p + j] * XtY[j];
5104                   beta[i] = sum;
5105
19
          }
5106
19
        }
5107
5108        // ========================================================================
5109
15022
        // PHASE 6: Metrics & Cleanup
5110        // ========================================================================
5111        res_hv = newHV(); coef_hv = newHV(); fitted_hv = newHV(); resid_hv = newHV();
5112        summary_hv = newHV(); terms_av = newAV();
5113
5114
19163
        df_res = (int)valid_n - final_rank;
5115
5116
19163
        // rss / mss accumulated here — rse_sq computed AFTER this loop (not before)
5117        double sum_y = 0.0, mss = 0.0;
5118
15016
        for (i = 0; i < valid_n; i++) sum_y += Y[i];
5119        double mean_y = sum_y / (double)valid_n;
5120
5121
15016
        for (i = 0; i < valid_n; i++) {
5122
15013
          double y_hat = 0.0;
5123          for (j = 0; j < p; j++) if (!aliased[j]) y_hat += X[i * p + j] * beta[j];
5124          double res   = Y[i] - y_hat;
5125          rss          += res * res;
5126
19
          double diff_m = has_intercept ? (y_hat - mean_y) : y_hat;
5127          mss          += diff_m * diff_m;
5128          hv_store(fitted_hv, valid_row_names[i], strlen(valid_row_names[i]), newSVnv(y_hat), 0);
5129          hv_store(resid_hv,  valid_row_names[i], strlen(valid_row_names[i]), newSVnv(res),   0);
5130          Safefree(valid_row_names[i]);
5131        }
5132        Safefree(valid_row_names);
5133
5134        // Single, authoritative rse_sq calculation
5135        rse_sq = (df_res > 0) ? (rss / (double)df_res) : NAN;
5136
5137        int df_int = has_intercept ? 1 : 0;
5138        double r_squared = 0.0, adj_r_squared = 0.0, f_stat = NAN, f_pvalue = NAN;
5139        int numdf = final_rank - df_int;
5140
5141
88
        if (final_rank != df_int && (mss + rss) > 0.0) {
5142
72
          r_squared     = mss / (mss + rss);
5143
72
          adj_r_squared = 1.0 - (1.0 - r_squared) * ((valid_n - df_int) / (double)df_res);
5144
43
          if (rse_sq > 0.0 && numdf > 0) {
5145
43
                   f_stat   = (mss / (double)numdf) / rse_sq;
5146
43
                   f_pvalue = 1.0 - pf(f_stat, (double)numdf, (double)df_res);
5147          } else if (rse_sq == 0.0) {
5148
43
                   f_stat   = INFINITY;
5149
43
                   f_pvalue = 0.0;
5150
33
          }
5151
27
        } else if (final_rank == df_int) {
5152          r_squared = 0.0; adj_r_squared = 0.0;
5153
27
        }
5154
5155        for (j = 0; j < p; j++) {
5156
27
          hv_store(coef_hv, exp_terms[j], strlen(exp_terms[j]), newSVnv(beta[j]), 0);
5157          av_push(terms_av, newSVpv(exp_terms[j], 0));
5158          HV *restrict row_hv = newHV();
5159          if (aliased[j]) {
5160                   hv_store(row_hv, "Estimate",   8,  newSVpv("NaN", 0), 0);
5161
27
                   hv_store(row_hv, "Std. Error", 10, newSVpv("NaN", 0), 0);
5162
33
                   hv_store(row_hv, "t value",    7,  newSVpv("NaN", 0), 0);
5163
27
                   hv_store(row_hv, "Pr(>|t|)",   8,  newSVpv("NaN", 0), 0);
5164
33
          } else {
5165
33
                   double se    = sqrt(rse_sq * XtX[j * p + j]);
5166
33
                   double t_val = (se > 0.0) ? (beta[j] / se) : (INFINITY * (beta[j] >= 0.0 ? 1.0 : -1.0));
5167
33
                   double p_val = get_t_pvalue(t_val, df_res, "two.sided");
5168
3060
                   hv_store(row_hv, "Estimate",   8,  newSVnv(beta[j]), 0);
5169
3054
                   hv_store(row_hv, "Std. Error", 10, newSVnv(se),      0);
5170
33
                   hv_store(row_hv, "t value",    7,  newSVnv(t_val),   0);
5171
29
                   hv_store(row_hv, "Pr(>|t|)",   8,  newSVnv(p_val),   0);
5172
203
          }
5173
176
          hv_store(summary_hv, exp_terms[j], strlen(exp_terms[j]), newRV_noinc((SV*)row_hv), 0);
5174
176
        }
5175
5176
2
        hv_store(res_hv, "coefficients",  12, newRV_noinc((SV*)coef_hv),   0);
5177
0
        hv_store(res_hv, "fitted.values", 13, newRV_noinc((SV*)fitted_hv), 0);
5178
0
        hv_store(res_hv, "residuals",      9, newRV_noinc((SV*)resid_hv),  0);
5179
2
        hv_store(res_hv, "df.residual",   11, newSVuv(df_res),             0);
5180
0
        hv_store(res_hv, "rank",           4, newSVuv(final_rank),         0);
5181        hv_store(res_hv, "rss",            3, newSVnv(rss),                0);
5182
7
        hv_store(res_hv, "summary",        7, newRV_noinc((SV*)summary_hv),0);
5183
5
        hv_store(res_hv, "terms",          5, newRV_noinc((SV*)terms_av),  0);
5184
5
        hv_store(res_hv, "r.squared",      9, newSVnv(r_squared),          0);
5185        hv_store(res_hv, "adj.r.squared", 13, newSVnv(adj_r_squared),      0);
5186
5
        if (!isnan(f_stat)) {
5187          AV *fstat_av = newAV();
5188
3
          av_push(fstat_av, newSVnv(f_stat));
5189
2
          av_push(fstat_av, newSViv(numdf));
5190
0
          av_push(fstat_av, newSViv(df_res));
5191
2
          hv_store(res_hv, "fstatistic", 10, newRV_noinc((SV*)fstat_av), 0);
5192
1
          hv_store(res_hv, "f.pvalue",    8, newSVnv(f_pvalue),          0);
5193
1
        }
5194
5195
5002
        // Deep Cleanup
5196
6318
        for (i = 0; i < num_terms; i++) Safefree(terms[i]); Safefree(terms);
5197        for (i = 0; i < num_uniq; i++) Safefree(uniq_terms[i]); Safefree(uniq_terms);
5198
6318
        for (j = 0; j < p_exp; j++) {
5199
6318
          Safefree(exp_terms[j]);
5200          if (is_dummy[j]) { Safefree(dummy_base[j]); Safefree(dummy_level[j]); }
5201
6318
        }
5202
5000
        Safefree(exp_terms); Safefree(is_dummy); Safefree(dummy_base); Safefree(dummy_level);
5203
5000
        Safefree(X); Safefree(Y); Safefree(XtX); Safefree(XtY);
5204        Safefree(beta); Safefree(aliased);
5205        if (row_hashes) Safefree(row_hashes);
5206
5207        RETVAL = newRV_noinc((SV*)res_hv);
5208}
5209OUTPUT:
5210    RETVAL
5211
5212
334
void seq(from, to, by = 1.0)
5213
36
        double from
5214        double to
5215
36
        double by
5216
36
PPCODE:
5217
18
        {
5218
12
                //Handle the zero 'by' case
5219
12
                if (by == 0.0) {
5220                        if (from == to) {
5221
33
                                 EXTEND(SP, 1);
5222
33
                                 mPUSHn(from);
5223
33
                                 XSRETURN(1);
5224                        } else {
5225                                 croak("invalid 'by' argument: cannot be zero when from != to");
5226
33
                        }
5227
33
                }
5228
33
                // Check for wrong direction / infinite loop
5229
33
                if ((from < to && by < 0.0) || (from > to && by > 0.0)) {
5230
33
                        croak("wrong sign in 'by' argument");
5231
33
                }
5232
33
                /* * Calculate number of elements.
5233                * R uses a small epsilon (like 1e-10) to avoid dropping the last
5234
33
                * element due to floating point inaccuracies.
5235
33
                */
5236
33
                double n_elements_d = (to - from) / by;
5237
33
                if (n_elements_d < 0.0) n_elements_d = 0.0;
5238                size_t n_elements = (n_elements_d + 1e-10) + 1;
5239
33
                // Pre-extend the stack to avoid reallocating inside the loop
5240
33
                EXTEND(SP, n_elements);
5241
33
                for (size_t i = 0; i < n_elements; i++) {
5242
66
                        mPUSHn(from + i * by);
5243
42
                }
5244
12
                XSRETURN(n_elements);
5245
21
        }
5246
5247
67
SV* rnorm(...)
5248
67
        CODE:
5249
9
        {
5250
6
          // Auto-seed the PRNG if the Perl script hasn't done so yet
5251
6
          AUTO_SEED_PRNG();
5252
5253
6
          size_t n = 0;
5254
6
          double mean = 0.0, sd = 1.0;
5255          int arg_start = 0;
5256
5257          // Check if the first argument is a simple integer (rnorm(33))
5258          if (items > 0 && SvIOK(ST(0)) && (items == 1 || items % 2 != 0)) {
5259
3
                   n = (unsigned int)SvUV(ST(0));
5260                   arg_start = 1; // Start parsing named arguments from the second element
5261
30
          }
5262
5263
30
          // --- Parse remaining named arguments from the flat stack ---
5264
30
          if ((items - arg_start) % 2 != 0) {
5265
30
                   croak("Usage: rnorm(n), rnorm(n => 10, mean => 0, sd => 1), or rnorm(33, mean => 0)");
5266          }
5267
5268
33
          for (int i = arg_start; i < items; i += 2) {
5269                   const char* restrict key = SvPV_nolen(ST(i));
5270                   SV* restrict val = ST(i + 1);
5271
5272
24
                   if      (strEQ(key, "n"))    n    = (unsigned int)SvUV(val);
5273
24
                   else if (strEQ(key, "mean")) mean = SvNV(val);
5274
24
                   else if (strEQ(key, "sd"))   sd   = SvNV(val);
5275
24
                   else croak("rnorm: unknown argument '%s'", key);
5276
24
          }
5277
5278          if (sd < 0.0) croak("rnorm: standard deviation must be non-negative");
5279
5280          AV *restrict result_av = newAV();
5281
24
          if (n > 0) {
5282
24
                   av_extend(result_av, n - 1);
5283
60
                   // Generate random normals using the Box-Muller transform
5284
36
                   for (size_t i = 0; i < n; ) {
5285
9
                        double u, v, s;
5286
111
                        do {
5287                            // Drand01() hooks into Perl's internal PRNG, respecting Perl's srand()
5288
45
                            u = 2.0 * Drand01() - 1.0;
5289
45
                            v = 2.0 * Drand01() - 1.0;
5290
12
                            s = u * u + v * v;
5291
6
                        } while (s >= 1.0 || s == 0.0);
5292
5293
4
                        double mul = sqrt(-2.0 * log(s) / s);
5294
11
                        // Box-Muller generates two independent values per iteration
5295
11
                        av_store(result_av, i++, newSVnv(mean + sd * u * mul));
5296
11
                        if (i < n) {
5297
11
                            av_store(result_av, i++, newSVnv(mean + sd * v * mul));
5298
11
                        }
5299
11
                   }
5300
11
          }
5301          RETVAL = newRV_noinc((SV*)result_av);
5302
41
        }
5303
41
        OUTPUT:
5304
41
        RETVAL
5305
5306
44
SV* aov(data_sv, formula_sv)
5307        SV* data_sv
5308        SV* formula_sv
5309        CODE:
5310
98
        {
5311
74
          const char *restrict formula = SvPV_nolen(formula_sv);
5312
137
          char f_cpy[512];
5313
71
          char *restrict src, *restrict dst, *restrict tilde, *restrict lhs, *restrict rhs, *restrict chunk;
5314
5315
74
          char **restrict terms = NULL, **restrict uniq_terms = NULL, **restrict exp_terms = NULL, **restrict parent_term = NULL;
5316          bool *restrict is_dummy = NULL, *is_interact = NULL;
5317
32
          char **restrict dummy_base = NULL, **restrict dummy_level = NULL;
5318          int *restrict term_map = NULL, *restrict left_idx = NULL, *restrict right_idx = NULL;
5319          unsigned int term_cap = 64, exp_cap = 64, num_terms = 0, num_uniq = 0, p = 0, p_exp = 0;
5320          size_t n = 0, valid_n = 0, i, j;
5321          bool has_intercept = TRUE;
5322
5323
77
          char **restrict row_names = NULL;
5324
25
          HV **restrict row_hashes = NULL;
5325
28
          HV *restrict data_hoa = NULL;
5326
27
          SV *restrict ref = NULL;
5327
27
          HE *restrict entry;
5328
27
          double **restrict X_mat = NULL;
5329          double *restrict Y = NULL;
5330
5331
69
          char **restrict term_base_level = NULL;  /* reference level for each uniq_term (NULL if not categorical) */
5332
26
          if (!SvROK(data_sv)) croak("aov: data is required and must be a reference");
5333
26
          // ========================================================================
5334
26
          // PHASE 1: Data Extraction
5335
26
          // ========================================================================
5336
26
          ref = SvRV(data_sv);
5337
25
          if (SvTYPE(ref) == SVt_PVHV) {
5338                   HV*restrict hv = (HV*)ref;
5339                   if (hv_iterinit(hv) == 0) croak("aov: Data hash is empty");
5340
52
                   entry = hv_iternext(hv);
5341
52
                   if (entry) {
5342                        SV*restrict val = hv_iterval(hv, entry);
5343
16
                        if (SvROK(val) && SvTYPE(SvRV(val)) == SVt_PVAV) {
5344
16
                            data_hoa = hv;
5345
16
                            n = av_len((AV*)SvRV(val)) + 1;
5346                            Newx(row_names, n, char*);
5347
17
                            for(i = 0; i < n; i++) {
5348
14
                                char buf[32]; snprintf(buf, sizeof(buf), "%lu", (unsigned long)(i+1));
5349
26
                                row_names[i] = savepv(buf);
5350
20
                            }
5351
20
                        } else if (SvROK(val) && SvTYPE(SvRV(val)) == SVt_PVHV) {
5352                            n = hv_iterinit(hv);
5353                            Newx(row_names, n, char*); Newx(row_hashes, n, HV*);
5354
14
                            i = 0;
5355
11
                            while ((entry = hv_iternext(hv))) {
5356
11
                                I32 len;
5357                                row_names[i] = savepv(hv_iterkey(entry, &len));
5358
14
                                row_hashes[i] = (HV*)SvRV(hv_iterval(hv, entry));
5359
14
                                i++;
5360
23
                            }
5361
12
                        } else croak("aov: Hash values must be ArrayRefs (HoA) or HashRefs (HoH)");
5362
0
                   }
5363
0
          } else if (SvTYPE(ref) == SVt_PVAV) {
5364
12
                   AV*restrict av = (AV*)ref;
5365
12
                   n = av_len(av) + 1;
5366                   Newx(row_names, n, char*);
5367
4
                   Newx(row_hashes, n, HV*);
5368
4
                   for (i = 0; i < n; i++) {
5369
4
                        SV**restrict val = av_fetch(av, i, 0);
5370
4
                        if (val && SvROK(*val) && SvTYPE(SvRV(*val)) == SVt_PVHV) {
5371
4
                            row_hashes[i] = (HV*)SvRV(*val);
5372
4
                            char buf[32];
5373
4
                            snprintf(buf, sizeof(buf), "%lu", (unsigned long)(i + 1));
5374
4
                            row_names[i] = savepv(buf);
5375
4
                        } else {
5376                            for (size_t k = 0; k < i; k++) Safefree(row_names[k]);
5377                            Safefree(row_names); Safefree(row_hashes);
5378                            croak("aov: Array values must be HashRefs (AoH)");
5379
4
                        }
5380                   }
5381
37
          } else croak("aov: Data must be an Array or Hash reference");
5382
5383
20
          // ========================================================================
5384
20
          // PHASE 2: Formula Parsing & `.` Expansion
5385
168
          // ========================================================================
5386
177
          src = (char*)formula; dst = f_cpy;
5387
169
          while (*src && (dst - f_cpy < 511)) { if (!isspace(*src)) { *dst++ = *src; } src++; }
5388
190
          *dst = '\0';
5389
5390
235
          tilde = strchr(f_cpy, '~');
5391          if (!tilde) {
5392
155
                   for (i = 0; i < n; i++) Safefree(row_names[i]);
5393
29
                   Safefree(row_names); if (row_hashes) Safefree(row_hashes);
5394
50
                   croak("aov: invalid formula, missing '~'");
5395          }
5396
169
          *tilde = '\0';
5397          lhs = f_cpy;
5398          rhs = tilde + 1;
5399
5400
17
          char *restrict p_idx;
5401
29
          while ((p_idx = strstr(rhs, "-1")) != NULL) { has_intercept = FALSE; memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1); }
5402
35
          while ((p_idx = strstr(rhs, "+0")) != NULL) { has_intercept = FALSE; memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1); }
5403
23
          while ((p_idx = strstr(rhs, "0+")) != NULL) { has_intercept = FALSE; memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1); }
5404
11
          if (rhs[0] == '0' && rhs[1] == '\0')        { has_intercept = FALSE; rhs[0] = '\0'; }
5405          while ((p_idx = strstr(rhs, "+1")) != NULL) { memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1); }
5406          if (rhs[0] == '1' && rhs[1] == '\0')        { rhs[0] = '\0'; }
5407          else if (rhs[0] == '1' && rhs[1] == '+')    { memmove(rhs, rhs + 2, strlen(rhs + 2) + 1); }
5408
5409
20
          while ((p_idx = strstr(rhs, "++")) != NULL) memmove(p_idx, p_idx + 1, strlen(p_idx + 1) + 1);
5410
8
          if (rhs[0] == '+') memmove(rhs, rhs + 1, strlen(rhs + 1) + 1);
5411
8
          size_t len_rhs = strlen(rhs);
5412
8
          if (len_rhs > 0 && rhs[len_rhs - 1] == '+') rhs[len_rhs - 1] = '\0';
5413
5414
8
          char rhs_expanded[2048] = "";
5415          size_t rhs_len = 0;
5416
26
          chunk = strtok(rhs, "+");
5417
26
          while (chunk != NULL) {
5418
14
                   if (strcmp(chunk, ".") == 0) {
5419
14
                        AV *restrict cols = get_all_columns(data_hoa, row_hashes, n);
5420
14
                        for (size_t c = 0; c <= av_len(cols); c++) {
5421
14
                            SV **restrict col_sv = av_fetch(cols, c, 0);
5422
14
                            if (col_sv && SvOK(*col_sv)) {
5423
18
                                const char *restrict col_name = SvPV_nolen(*col_sv);
5424
16
                                if (strcmp(col_name, lhs) != 0) {
5425                                    size_t slen = strlen(col_name);
5426
34
                                    if (rhs_len + slen + 2 < sizeof(rhs_expanded)) {
5427
11
                                        if (rhs_len > 0) { strcat(rhs_expanded, "+"); rhs_len++; }
5428                                        strcat(rhs_expanded, col_name);
5429
1
                                        rhs_len += slen;
5430
1
                                    }
5431
2
                                }
5432
2
                            }
5433
1
                        }
5434
0
                        SvREFCNT_dec(cols);
5435                   } else {
5436                        size_t slen = strlen(chunk);
5437
27
                        if (rhs_len + slen + 2 < sizeof(rhs_expanded)) {
5438
27
                            if (rhs_len > 0) { strcat(rhs_expanded, "+"); rhs_len++; }
5439
27
                            strcat(rhs_expanded, chunk);
5440
27
                            rhs_len += slen;
5441
28
                        }
5442                   }
5443                   chunk = strtok(NULL, "+");
5444          }
5445
5446
178
          // Setup arrays safely
5447
22
          Newx(terms, term_cap, char*);
5448          Newx(uniq_terms, term_cap, char*);
5449          Newx(exp_terms, exp_cap, char*); Newx(parent_term, exp_cap, char*);
5450          Newx(is_dummy, exp_cap, bool); Newx(is_interact, exp_cap, bool);
5451
178
          Newx(dummy_base, exp_cap, char*); Newx(dummy_level, exp_cap, char*);
5452
157
          Newx(term_map, exp_cap, int); Newx(left_idx, exp_cap, int); Newx(right_idx, exp_cap, int);
5453
5454
157
          if (has_intercept) { terms[num_terms++] = savepv("Intercept"); }
5455
5456
667
          if (strlen(rhs_expanded) > 0) {
5457
522
                   chunk = strtok(rhs_expanded, "+");
5458
159
                   while (chunk != NULL) {
5459
357
                        if (num_terms >= term_cap - 3) {
5460
63
                            term_cap *= 2;
5461
346
                            Renew(terms, term_cap, char*); Renew(uniq_terms, term_cap, char*);
5462
223
                        }
5463
223
                        char *restrict star = strchr(chunk, '*');
5464
223
                        if (star) {
5465
252
                            *star = '\0';
5466
71
                            char *restrict left = chunk;
5467                            char *right = star + 1;
5468
169
                            char *restrict c_l = strchr(left, '^');
5469
127
                            if (c_l && strncmp(left, "I(", 2) != 0) *c_l = '\0';
5470                            char *restrict c_r = strchr(right, '^'); if (c_r && strncmp(right, "I(", 2) != 0) *c_r = '\0';
5471                            terms[num_terms++] = savepv(left);
5472
163
                            terms[num_terms++] = savepv(right);
5473                            size_t inter_len = strlen(left) + strlen(right) + 2;
5474
205
                            terms[num_terms] = (char*)safemalloc(inter_len);
5475
669
                            snprintf(terms[num_terms++], inter_len, "%s:%s", left, right);
5476
163
                        } else {
5477
165
                            char *restrict c_chunk = strchr(chunk, '^');
5478
161
                            if (c_chunk && strncmp(chunk, "I(", 2) != 0) *c_chunk = '\0';
5479                            terms[num_terms++] = savepv(chunk);
5480
22
                        }
5481
24
                        chunk = strtok(NULL, "+");
5482                   }
5483
19
          }
5484
5485
12
          for (i = 0; i < num_terms; i++) {
5486
9
                   bool found = FALSE;
5487
9
                   for (size_t k = 0; k < num_uniq; k++) {
5488                         if (strcmp(terms[i], uniq_terms[k]) == 0) { found = TRUE; break; }
5489
3
                   }
5490
3
                   if (!found) uniq_terms[num_uniq++] = savepv(terms[i]);
5491
7
          }
5492
7
          p = num_uniq;
5493
5494
7
          /* ---- NEW: allocate one slot per unique term, zero-initialised so
5495
7
                           non-categorical terms stay NULL                          ---- */
5496
7
          Newxz(term_base_level, num_uniq, char*);
5497          /* ----------------------------------------------------------------------- */
5498
5499          // ========================================================================
5500          // PHASE 3: Categorical & Interaction Expansion
5501
22
          // ========================================================================
5502
22
          for (j = 0; j < p; j++) {
5503
22
                   if (p_exp + 64 >= exp_cap) {
5504                         exp_cap *= 2;
5505                         Renew(exp_terms, exp_cap, char*); Renew(parent_term, exp_cap, char*);
5506
28
                         Renew(is_dummy, exp_cap, bool); Renew(is_interact, exp_cap, bool);
5507
21
                         Renew(dummy_base, exp_cap, char*); Renew(dummy_level, exp_cap, char*);
5508                         Renew(term_map, exp_cap, int); Renew(left_idx, exp_cap, int); Renew(right_idx, exp_cap, int);
5509
72
                   }
5510
5511
36
                   if (strcmp(uniq_terms[j], "Intercept") == 0) {
5512
33
                         exp_terms[p_exp] = savepv("Intercept");
5513
33
                         parent_term[p_exp] = savepv("Intercept");
5514
33
                         is_dummy[p_exp] = FALSE; is_interact[p_exp] = FALSE;
5515
42
                         term_map[p_exp] = j;
5516                         p_exp++;
5517
27
                         continue;
5518
81
                   }
5519
5520                   char *restrict colon = strchr(uniq_terms[j], ':');
5521
27
                   if (colon) {
5522
124
                         char left[256], right[256];
5523
158
                         strncpy(left, uniq_terms[j], colon - uniq_terms[j]);
5524                         left[colon - uniq_terms[j]] = '\0';
5525
25
                         strcpy(right, colon + 1);
5526
5527                         int *restrict l_indices = (int*)safemalloc(p_exp * sizeof(int)); int l_count = 0;
5528
70
                         int *restrict r_indices = (int*)safemalloc(p_exp * sizeof(int)); int r_count = 0;
5529
121
                         for (size_t e = 0; e < p_exp; e++) {
5530
103
                             if (strcmp(parent_term[e], left) == 0) l_indices[l_count++] = e;
5531
85
                             if (strcmp(parent_term[e], right) == 0) r_indices[r_count++] = e;
5532
255
                         }
5533
5534
85
                         if (l_count == 0 || r_count == 0) {
5535                             Safefree(l_indices); Safefree(r_indices);
5536
151
                             croak("aov: Interaction term '%s' requires its main effects to be explicitly included in the formula", uniq_terms[j]);
5537
53
                         } else {
5538
131
                             for (int li = 0; li < l_count; li++) {
5539
121
                                 for (int ri = 0; ri < r_count; ri++) {
5540
88
                                     if (p_exp >= exp_cap) {
5541
88
                                         exp_cap *= 2;
5542
88
                                         Renew(exp_terms, exp_cap, char*); Renew(parent_term, exp_cap, char*);
5543                                         Renew(is_dummy, exp_cap, bool); Renew(is_interact, exp_cap, bool);
5544
3
                                         Renew(dummy_base, exp_cap, char*); Renew(dummy_level, exp_cap, char*);
5545
43
                                         Renew(term_map, exp_cap, int); Renew(left_idx, exp_cap, int); Renew(right_idx, exp_cap, int);
5546                                     }
5547
73
                                     size_t t_len = strlen(exp_terms[l_indices[li]]) + strlen(exp_terms[r_indices[ri]]) + 2;
5548                                     exp_terms[p_exp] = (char*)safemalloc(t_len);
5549                                     snprintf(exp_terms[p_exp], t_len, "%s:%s", exp_terms[l_indices[li]], exp_terms[r_indices[ri]]);
5550
70
                                     parent_term[p_exp] = savepv(uniq_terms[j]);
5551
70
                                     is_dummy[p_exp] = FALSE; is_interact[p_exp] = TRUE;
5552
240
                                     left_idx[p_exp] = l_indices[li];
5553
70
                                     right_idx[p_exp] = r_indices[ri];
5554
70
                                     term_map[p_exp] = j;
5555                                     p_exp++;
5556                                 }
5557
121
                             }
5558
76
                         }
5559
79
                         Safefree(l_indices); Safefree(r_indices);
5560
58
                   } else {
5561
58
                         if (is_column_categorical(data_hoa, row_hashes, n, uniq_terms[j])) {
5562                             char **restrict levels = NULL;
5563
22
                             unsigned int num_levels = 0, levels_cap = 8;
5564
21
                             Newx(levels, levels_cap, char*);
5565
21
                             for (i = 0; i < n; i++) {
5566
19
                                 char* str_val = get_data_string_alloc(data_hoa, row_hashes, i, uniq_terms[j]);
5567
19
                                 if (str_val) {
5568
169
                                     bool found = FALSE;
5569
19
                                     for (size_t l = 0; l < num_levels; l++) {
5570
21
                                         if (strcmp(levels[l], str_val) == 0) { found = TRUE; break; }
5571
19
                                     }
5572
19
                                     if (!found) {
5573                                         if (num_levels >= levels_cap) { levels_cap *= 2; Renew(levels, levels_cap, char*); }
5574                                         levels[num_levels++] = savepv(str_val);
5575                                     }
5576                                     Safefree(str_val);
5577                                 }
5578                             }
5579
5580                             if (num_levels > 0) {
5581                                 for (size_t l1 = 0; l1 < num_levels - 1; l1++) {
5582
22
                                     for (size_t l2 = l1 + 1; l2 < num_levels; l2++) {
5583                                         if (strcmp(levels[l1], levels[l2]) > 0) {
5584
16
                                             char *tmp = levels[l1]; levels[l1] = levels[l2]; levels[l2] = tmp;
5585
16
                                         }
5586
21
                                     }
5587                                 }
5588
5589
27
                                 /* ---- NEW: record the reference (base) level for this term ---- */
5590
12
                                 term_base_level[j] = savepv(levels[0]);
5591
12
                                 /* --------------------------------------------------------------- */
5592
5593
30
                                 for (size_t l = 1; l < num_levels; l++) {
5594
18
                                     if (p_exp >= exp_cap) {
5595
18
                                         exp_cap *= 2;
5596
17
                                         Renew(exp_terms, exp_cap, char*); Renew(parent_term, exp_cap, char*);
5597                                         Renew(is_dummy, exp_cap, bool); Renew(is_interact, exp_cap, bool);
5598                                         Renew(dummy_base, exp_cap, char*); Renew(dummy_level, exp_cap, char*);
5599                                         Renew(term_map, exp_cap, int); Renew(left_idx, exp_cap, int); Renew(right_idx, exp_cap, int);
5600
26
                                     }
5601
26
                                     size_t t_len = strlen(uniq_terms[j]) + strlen(levels[l]) + 1;
5602
26
                                     exp_terms[p_exp] = (char*)safemalloc(t_len);
5603                                     snprintf(exp_terms[p_exp], t_len, "%s%s", uniq_terms[j], levels[l]);
5604
21
                                     parent_term[p_exp] = savepv(uniq_terms[j]);
5605
30
                                     is_dummy[p_exp] = TRUE; is_interact[p_exp] = FALSE;
5606
24
                                     dummy_base[p_exp] = savepv(uniq_terms[j]);
5607
12
                                     dummy_level[p_exp] = savepv(levels[l]);
5608
45
                                     term_map[p_exp] = j;
5609
39
                                     p_exp++;
5610
12
                                 }
5611
12
                                 for (size_t l = 0; l < num_levels; l++) Safefree(levels[l]);
5612
12
                                 Safefree(levels);
5613
29
                             } else {
5614
23
                                 Safefree(levels);
5615
17
                                 exp_terms[p_exp] = savepv(uniq_terms[j]);
5616
17
                                 parent_term[p_exp] = savepv(uniq_terms[j]);
5617
17
                                 is_dummy[p_exp] = FALSE; is_interact[p_exp] = FALSE;
5618
17
                                 term_map[p_exp] = j;
5619
17
                                 p_exp++;
5620                             }
5621
11
                         } else {
5622                             exp_terms[p_exp] = savepv(uniq_terms[j]);
5623
20
                             parent_term[p_exp] = savepv(uniq_terms[j]);
5624                             is_dummy[p_exp] = FALSE; is_interact[p_exp] = FALSE;
5625
30
                             term_map[p_exp] = j;
5626
19
                             p_exp++;
5627
19
                         }
5628
19
                   }
5629
10
          }
5630
10
          X_mat = (double**)safemalloc(n * sizeof(double*));
5631
20
          for(i = 0; i < n; i++) X_mat[i] = (double*)safemalloc(p_exp * sizeof(double));
5632
15
          Newx(Y, n, double);
5633
15
          // ========================================================================
5634
15
          // PHASE 4: Matrix Construction & Listwise Deletion
5635
15
          // ========================================================================
5636
15
          for (i = 0; i < n; i++) {
5637
15
                   double y_val = evaluate_term(data_hoa, row_hashes, i, lhs);
5638
6
                   if (isnan(y_val)) { Safefree(row_names[i]); continue; }
5639                   bool row_ok = TRUE;
5640
15
                   double *restrict row_x = (double*)safemalloc(p_exp * sizeof(double));
5641
31
                   for (j = 0; j < p_exp; j++) {
5642
25
                         if (strcmp(exp_terms[j], "Intercept") == 0) {
5643
25
                             row_x[j] = 1.0;
5644
25
                         } else if (is_interact[j]) {
5645
25
                             row_x[j] = row_x[left_idx[j]] * row_x[right_idx[j]];
5646
25
                         } else if (is_dummy[j]) {
5647
160
                             char*restrict str_val = get_data_string_alloc(data_hoa, row_hashes, i, dummy_base[j]);
5648
144
                             if (str_val) {
5649
144
                                 row_x[j] = (strcmp(str_val, dummy_level[j]) == 0) ? 1.0 : 0.0;
5650
25
                                 Safefree(str_val);
5651
25
                             } else { row_ok = FALSE; break; }
5652
25
                         } else {
5653
15
                             row_x[j] = evaluate_term(data_hoa, row_hashes, i, parent_term[j]);
5654
15
                             if (isnan(row_x[j])) { row_ok = FALSE; break; }
5655
15
                         }
5656
15
                   }
5657
15
                   if (!row_ok) { Safefree(row_names[i]); Safefree(row_x); continue; }
5658
5659                   Y[valid_n] = y_val;
5660
23
                   for (j = 0; j < p_exp; j++) X_mat[valid_n][j] = row_x[j];
5661                   valid_n++;
5662                   Safefree(row_x);
5663                   Safefree(row_names[i]);
5664
39
          }
5665          Safefree(row_names);
5666
33
          if (valid_n <= p_exp) {
5667                   // Full Clean Up
5668                   for (i = 0; i < num_terms; i++) Safefree(terms[i]); Safefree(terms);
5669
33
                   for (i = 0; i < num_uniq; i++) Safefree(uniq_terms[i]); Safefree(uniq_terms);
5670
21
                   for (j = 0; j < p_exp; j++) {
5671
21
                        Safefree(exp_terms[j]); Safefree(parent_term[j]);
5672
21
                        if (is_dummy[j]) { Safefree(dummy_base[j]); Safefree(dummy_level[j]); }
5673
21
                   }
5674
21
                   Safefree(exp_terms); Safefree(parent_term);
5675
71
                   Safefree(is_dummy); Safefree(is_interact);
5676
21
                   Safefree(dummy_base); Safefree(dummy_level);
5677
21
                   Safefree(term_map); Safefree(left_idx); Safefree(right_idx);
5678
21
                   for(i = 0; i < n; i++) Safefree(X_mat[i]);
5679
21
                   Safefree(X_mat); Safefree(Y);
5680                   if (row_hashes) Safefree(row_hashes);
5681
21
                   /* ---- NEW ---- */
5682                   for (i = 0; i < num_uniq; i++) { if (term_base_level[i]) Safefree(term_base_level[i]); }
5683                   Safefree(term_base_level);
5684                   /* ------------ */
5685                   croak("aov: 0 degrees of freedom (too many NAs or parameters > observations)");
5686          }
5687        // ========================================================================
5688          // PHASE 5: Math & Output Formatting
5689
26
          // ========================================================================
5690
26
          bool *restrict aliased_qr = (bool*)safemalloc(p_exp * sizeof(bool));
5691
26
          size_t *restrict rank_map = (size_t*)safemalloc(p_exp * sizeof(size_t));
5692
28
          apply_householder_aov(X_mat, Y, valid_n, p_exp, aliased_qr, rank_map);
5693
23
          double *restrict term_ss;
5694          int *restrict term_df;
5695
23
          Newxz(term_ss, num_uniq, double);
5696
23
          Newxz(term_df, num_uniq, int);
5697
5698
21
          for (i = 0; i < p_exp; i++) {
5699                   if (strcmp(exp_terms[i], "Intercept") == 0) continue;
5700
23
                   if (aliased_qr[i]) continue;
5701
104
                   int t_idx = term_map[i];
5702
86
                   size_t r_k = rank_map[i];
5703
86
                   term_ss[t_idx] += Y[r_k] * Y[r_k];
5704                   term_df[t_idx] += 1;
5705
86
          }
5706
83
          int rank = 0;
5707
59
          for (i = 0; i < p_exp; i++) {
5708
38
                   if (!aliased_qr[i]) rank++;
5709
35
          }
5710
17
          double rss_prev = 0.0;
5711
8
          for (i = rank; i < valid_n; i++) {
5712
2
                   rss_prev += Y[i] * Y[i];
5713
2
          }
5714
2
          int res_df = valid_n - rank;
5715          double ms_res = (res_df > 0) ? rss_prev / res_df : 0.0;
5716
5717
23
          HV*restrict ret_hash = newHV();
5718
23
          for (j = 0; j < num_uniq; j++) {
5719
23
                   if (strcmp(uniq_terms[j], "Intercept") == 0) continue;
5720
23
                   HV*restrict term_stats = newHV();
5721
23
                   double ss = term_ss[j];
5722                   int df = term_df[j];
5723
23
                   double ms = (df > 0) ? ss / df : 0.0;
5724
5725
21
                   hv_stores(term_stats, "Df", newSViv(df));
5726
24
                   hv_stores(term_stats, "Sum Sq", newSVnv(ss));
5727
24
                   hv_stores(term_stats, "Mean Sq", newSVnv(ms));
5728
24
                   if (ms_res > 0.0 && df > 0) {
5729                         double f_val = ms / ms_res;
5730
24
                         hv_stores(term_stats, "F value", newSVnv(f_val));
5731
3
                         hv_stores(term_stats, "Pr(>F)", newSVnv(1.0 - pf(f_val, (double)df, (double)res_df)));
5732                   } else {
5733                         hv_stores(term_stats, "F value", newSVnv(NAN));
5734
24
                         hv_stores(term_stats, "Pr(>F)", newSVnv(NAN));
5735
24
                   }
5736
24
                   hv_store(ret_hash, uniq_terms[j], strlen(uniq_terms[j]), newRV_noinc((SV*)term_stats), 0);
5737
24
          }
5738
5739
24
          HV*restrict res_stats = newHV();
5740
24
          hv_stores(res_stats, "Df", newSViv(res_df));
5741
24
          hv_stores(res_stats, "Sum Sq", newSVnv(rss_prev));
5742
24
          hv_stores(res_stats, "Mean Sq", newSVnv(ms_res));
5743
3
          hv_stores(ret_hash, "Residuals", newRV_noinc((SV*)res_stats));
5744
5745
21
          /* ---- NEW: group_stats => { mean => {...}, size => {...} } ---- */
5746
21
          {
5747
687
                   AV *restrict all_cols = get_all_columns(data_hoa, row_hashes, n);
5748
669
                   HV *restrict mean_hv  = newHV();
5749
669
                   HV *restrict size_hv  = newHV();
5750
522
                   for (size_t c = 0; c <= (size_t)av_len(all_cols); c++) {
5751                       SV **restrict col_sv = av_fetch(all_cols, c, 0);
5752
21
                       if (!col_sv || !SvOK(*col_sv)) continue;
5753
3
                       const char *restrict col_name = SvPV_nolen(*col_sv);
5754
3
                       double col_sum = 0.0;
5755
3
                       IV     col_count = 0;
5756
3
                       for (i = 0; i < n; i++) {
5757
3
                           double val = evaluate_term(data_hoa, row_hashes, i, col_name);
5758
3
                           if (!isnan(val)) { col_sum += val; col_count++; }
5759                       }
5760
3
                       double col_mean = (col_count > 0) ? col_sum / col_count : NAN;
5761
3
                       hv_store(mean_hv, col_name, strlen(col_name), newSVnv(col_mean), 0);
5762
3
                       hv_store(size_hv, col_name, strlen(col_name), newSViv(col_count), 0);
5763
3
                   }
5764
3
                   SvREFCNT_dec(all_cols);
5765
0
                   HV *restrict gs_hv = newHV();
5766
5
                   hv_stores(gs_hv, "mean", newRV_noinc((SV*)mean_hv));
5767
5
                   hv_stores(gs_hv, "size", newRV_noinc((SV*)size_hv));
5768                   hv_stores(ret_hash, "group_stats", newRV_noinc((SV*)gs_hv));
5769
5
          }
5770
5
          /* -------------------------------------------------------------- */
5771
5772
5
          // Deep Cleanup
5773
5
          for (i = 0; i < num_terms; i++) Safefree(terms[i]); Safefree(terms);
5774
5
          for (i = 0; i < num_uniq; i++) Safefree(uniq_terms[i]); Safefree(uniq_terms);
5775
5
          for (j = 0; j < p_exp; j++) {
5776                   Safefree(exp_terms[j]); Safefree(parent_term[j]);
5777
5
                   if (is_dummy[j]) { Safefree(dummy_base[j]); Safefree(dummy_level[j]); }
5778          }
5779
26
          Safefree(exp_terms); Safefree(parent_term);
5780
26
          Safefree(is_dummy); Safefree(is_interact);
5781
26
          Safefree(dummy_base); Safefree(dummy_level);
5782
26
          Safefree(term_map); Safefree(left_idx); Safefree(right_idx);
5783
28
          Safefree(term_ss); Safefree(term_df);
5784
28
          for (i = 0; i < n; i++) Safefree(X_mat[i]);
5785
28
          Safefree(X_mat); Safefree(Y);
5786
28
          Safefree(aliased_qr); Safefree(rank_map);
5787
28
          if (row_hashes) Safefree(row_hashes);
5788
28
          RETVAL = newRV_noinc((SV*)ret_hash);
5789
28
        }
5790
28
OUTPUT:
5791    RETVAL
5792
5793PROTOTYPES: DISABLE
5794
5795SV* fisher_test(...)
5796CODE:
5797{
5798
15
        if (items < 1) croak("fisher_test requires at least a data reference");
5799
5800        SV*restrict data_ref = ST(0);
5801        double conf_level = 0.95;
5802        const char*restrict alternative = "two.sided";
5803
5804
33
        // Parse named arguments
5805
33
        for (unsigned short int i = 1; i < items; i += 2) {
5806
30
                if (i + 1 >= items) croak("fisher_test: odd number of arguments");
5807
29
                const char*restrict key = SvPV_nolen(ST(i));
5808
22
                SV*restrict val = ST(i + 1);
5809                if (strEQ(key, "conf_level") || strEQ(key, "conf.level")) {
5810                        conf_level = SvNV(val);
5811
20
                } else if (strEQ(key, "alternative")) {
5812
16
                        alternative = SvPV_nolen(val);
5813
8
                }
5814
5
        }
5815
5816        if (!SvROK(data_ref)) croak("fisher_test requires a reference to an Array or Hash");
5817
12
        SV*restrict deref = SvRV(data_ref);
5818
4
        size_t a = 0, b = 0, c = 0, d = 0;
5819
4
        // Extract Data
5820
11
        if (SvTYPE(deref) == SVt_PVAV) {
5821
9
          AV*restrict outer = (AV*)deref;
5822
7
          if (av_len(outer) != 1) croak("Outer array must have exactly 2 rows");
5823
7
          SV**restrict row1_ptr = av_fetch(outer, 0, 0);
5824          SV**restrict row2_ptr = av_fetch(outer, 1, 0);
5825          if (row1_ptr && row2_ptr && SvROK(*row1_ptr) && SvROK(*row2_ptr)) {
5826
15
                   AV*restrict row1 = (AV*)SvRV(*row1_ptr);
5827
7
                   AV*restrict row2 = (AV*)SvRV(*row2_ptr);
5828                   SV**restrict a_ptr = av_fetch(row1, 0, 0);
5829                   SV**restrict b_ptr = av_fetch(row1, 1, 0);
5830                   SV**restrict c_ptr = av_fetch(row2, 0, 0);
5831
15
                   SV**restrict d_ptr = av_fetch(row2, 1, 0);
5832
15
                   a = (a_ptr && SvOK(*a_ptr)) ? SvIV(*a_ptr) : 0;
5833
15
                   b = (b_ptr && SvOK(*b_ptr)) ? SvIV(*b_ptr) : 0;
5834                   c = (c_ptr && SvOK(*c_ptr)) ? SvIV(*c_ptr) : 0;
5835                   d = (d_ptr && SvOK(*d_ptr)) ? SvIV(*d_ptr) : 0;
5836          } else {
5837                  croak("Invalid 2D Array structure");
5838
15
          }
5839
10
        } else if (SvTYPE(deref) == SVt_PVHV) {
5840
7
                // Fixed 2D Hash Logic: Sort keys lexically to enforce structured rows/columns
5841
3
                HV*restrict outer = (HV*)deref;
5842                if (hv_iterinit(outer) != 2) croak("Outer hash must have exactly 2 keys");
5843
10
                HE*restrict he1 = hv_iternext(outer);
5844
10
                HE*restrict he2 = hv_iternext(outer);
5845                if (!he1 || !he2) croak("Invalid outer hash");
5846
19
                const char*restrict k1 = SvPV_nolen(hv_iterkeysv(he1));
5847
16
                const char*restrict k2 = SvPV_nolen(hv_iterkeysv(he2));
5848
16
                HE*restrict row1_he = (strcmp(k1, k2) < 0) ? he1 : he2;
5849
7
                HE*restrict row2_he = (strcmp(k1, k2) < 0) ? he2 : he1;
5850
16
                SV*restrict row1_sv = hv_iterval(outer, row1_he);
5851                SV*restrict row2_sv = hv_iterval(outer, row2_he);
5852
10
                if (!SvROK(row1_sv) || SvTYPE(SvRV(row1_sv)) != SVt_PVHV ||
5853                        !SvROK(row2_sv) || SvTYPE(SvRV(row2_sv)) != SVt_PVHV) {
5854
10
                        croak("Inner elements must be hashes");
5855                }
5856                HV*restrict in1 = (HV*)SvRV(row1_sv);
5857
4
                HV*restrict in2 = (HV*)SvRV(row2_sv);
5858
9
                if (hv_iterinit(in1) != 2 || hv_iterinit(in2) != 2) croak("Inner hashes must have exactly 2 keys");
5859
18
                HE*restrict in1_he1 = hv_iternext(in1);
5860
15
                HE*restrict in1_he2 = hv_iternext(in1);
5861
237
                const char*restrict in1_k1 = SvPV_nolen(hv_iterkeysv(in1_he1));
5862
273
                const char*restrict in1_k2 = SvPV_nolen(hv_iterkeysv(in1_he2));
5863
264
                HE*restrict in1_c1 = (strcmp(in1_k1, in1_k2) < 0) ? in1_he1 : in1_he2;
5864
215
                HE*restrict in1_c2 = (strcmp(in1_k1, in1_k2) < 0) ? in1_he2 : in1_he1;
5865
48
                HE*restrict in2_he1 = hv_iternext(in2);
5866
42
                HE*restrict in2_he2 = hv_iternext(in2);
5867
42
                const char*restrict in2_k1 = SvPV_nolen(hv_iterkeysv(in2_he1));
5868                const char*restrict in2_k2 = SvPV_nolen(hv_iterkeysv(in2_he2));
5869                HE*restrict in2_c1 = (strcmp(in2_k1, in2_k2) < 0) ? in2_he1 : in2_he2;
5870
9
                HE*restrict in2_c2 = (strcmp(in2_k1, in2_k2) < 0) ? in2_he2 : in2_he1;
5871                a = (hv_iterval(in1, in1_c1) && SvOK(hv_iterval(in1, in1_c1))) ? SvIV(hv_iterval(in1, in1_c1)) : 0;
5872
3
                b = (hv_iterval(in1, in1_c2) && SvOK(hv_iterval(in1, in1_c2))) ? SvIV(hv_iterval(in1, in1_c2)) : 0;
5873                c = (hv_iterval(in2, in2_c1) && SvOK(hv_iterval(in2, in2_c1))) ? SvIV(hv_iterval(in2, in2_c1)) : 0;
5874                d = (hv_iterval(in2, in2_c2) && SvOK(hv_iterval(in2, in2_c2))) ? SvIV(hv_iterval(in2, in2_c2)) : 0;
5875        } else {
5876          croak("Input must be a 2D Array or 2D Hash");
5877        }
5878
5879
0
        // Perform Calculations via Helpers
5880
5
        double p_val = exact_p_value(a, b, c, d, alternative);
5881
0
        double mle_or, ci_low, ci_high;
5882        calculate_exact_stats(a, b, c, d, conf_level, alternative, &mle_or, &ci_low, &ci_high);
5883
5884
5
        // Construct the Return HashRef purely in C
5885
5
        HV*restrict ret_hash = newHV();
5886
5
        hv_stores(ret_hash, "method", newSVpv("Fisher's Exact Test for Count Data", 0));
5887
5
        hv_stores(ret_hash, "alternative", newSVpv(alternative, 0));
5888
5
        AV*restrict ci_array = newAV();
5889        av_push(ci_array, newSVnv(ci_low));
5890
5
        av_push(ci_array, newSVnv(ci_high));
5891        hv_stores(ret_hash, "conf_int", newRV_noinc((SV*)ci_array));
5892        HV*restrict est_hash = newHV();
5893
5
        hv_stores(ret_hash, "estimate", newRV_noinc((SV*)est_hash));
5894
5
        hv_stores(est_hash, "odds ratio", newSVnv(mle_or));
5895        hv_stores(ret_hash, "p_value", newSVnv(p_val));
5896
75
        // Return the HashRef
5897
70
        RETVAL = newRV_noinc((SV*)ret_hash);
5898
70
}
5899
70
OUTPUT:
5900
70
  RETVAL
5901
5902
77
SV* power_t_test(...)
5903
77
CODE:
5904{
5905
77
        SV*restrict sv_n = NULL;
5906
62
        SV*restrict sv_delta = NULL;
5907        SV*restrict sv_sd = NULL;
5908
22
        SV*restrict sv_sig_level = NULL;
5909
22
        SV*restrict sv_power = NULL;
5910
5911
77
        const char* restrict type = "two.sample";
5912
77
        const char* restrict alternative = "two.sided";
5913
77
        bool strict = FALSE;
5914        double tol = pow(2.2204460492503131e-16, 0.25);
5915
5916
12
        if (items % 2 != 0) croak("Usage: power_t_test(n => 30, delta => 0.5, sd => 1.0, ...)");
5917
12
        for (unsigned short int i = 0; i < items; i += 2) {
5918          const char* restrict key = SvPV_nolen(ST(i));
5919          SV* restrict val = ST(i+1);
5920
5921          if      (strEQ(key, "n"))           sv_n = val;
5922          else if (strEQ(key, "delta"))       sv_delta = val;
5923
11
          else if (strEQ(key, "sd"))          sv_sd = val;
5924
11
          else if (strEQ(key, "sig.level") || strEQ(key, "sig_level")) sv_sig_level = val;
5925          else if (strEQ(key, "power"))       sv_power = val;
5926          else if (strEQ(key, "type"))        type = SvPV_nolen(val);
5927
11
          else if (strEQ(key, "alternative")) alternative = SvPV_nolen(val);
5928
10
          else if (strEQ(key, "strict"))      strict = SvTRUE(val);
5929          else if (strEQ(key, "tol"))         tol = SvNV(val);
5930          else croak("power_t_test: unknown argument '%s'", key);
5931
10
        }
5932
5933
121
        bool is_null_n = (!sv_n || !SvOK(sv_n));
5934
113
        bool is_null_delta = (!sv_delta || !SvOK(sv_delta));
5935
115
        bool is_null_power = (!sv_power || !SvOK(sv_power));
5936
114
        bool is_null_sd = (sv_sd && !SvOK(sv_sd));
5937        bool is_null_sig_level = (sv_sig_level && !SvOK(sv_sig_level));
5938
5939        unsigned int missing_count = 0;
5940
9
        if (is_null_n) missing_count++;
5941
33
        if (is_null_delta) missing_count++;
5942
29
        if (is_null_power) missing_count++;
5943
26
        if (is_null_sd) missing_count++;
5944
26
        if (is_null_sig_level) missing_count++;
5945
5946
10
        if (missing_count != 1) {
5947          croak("power_t_test: exactly one of 'n', 'delta', 'sd', 'power', and 'sig_level' must be undef/NULL");
5948
9
        }
5949
5950
8
        double n = is_null_n ? 0.0 : SvNV(sv_n);
5951
3
        double delta = is_null_delta ? 0.0 : SvNV(sv_delta);
5952
0
        double sd = (!sv_sd || is_null_sd) ? 1.0 : SvNV(sv_sd);
5953        double sig_level = (!sv_sig_level || is_null_sig_level) ? 0.05 : SvNV(sv_sig_level);
5954
11
        double power = is_null_power ? 0.0 : SvNV(sv_power);
5955
11
        short int tsample = (strEQ(type, "one.sample") || strEQ(type, "paired")) ? 1 : 2;
5956        short int tside = (strEQ(alternative, "one.sided") || strEQ(alternative, "greater") || strEQ(alternative, "less")) ? 1 : 2;
5957        if (tside == 2 && !is_null_delta) delta = fabs(delta);
5958
11
        if (is_null_power) {
5959
11
          power = p_body(n, delta, sd, sig_level, tsample, tside, strict);
5960
11
        } else if (is_null_n) {
5961
9
          double low = 2.0, high = 1e7;
5962
8
          while (p_body(high, delta, sd, sig_level, tsample, tside, strict) < power && high < 1e12) high *= 2.0;
5963
9
          while (high - low > tol) {
5964
9
                   double mid = low + (high - low) / 2.0;
5965                   if (p_body(mid, delta, sd, sig_level, tsample, tside, strict) < power) low = mid;
5966                   else high = mid;
5967          }
5968          n = low + (high - low) / 2.0;
5969        } else if (is_null_sd) {
5970          double low = delta * 1e-7, high = delta * 1e7;
5971          while (high - low > tol) {
5972                   double mid = low + (high - low) / 2.0;
5973                   if (p_body(n, delta, mid, sig_level, tsample, tside, strict) > power) low = mid;
5974                   else high = mid;
5975          }
5976          sd = low + (high - low) / 2.0;
5977        } else if (is_null_delta) {
5978          double low = sd * 1e-7, high = sd * 1e7;
5979          while (p_body(n, high, sd, sig_level, tsample, tside, strict) < power && high < 1e12) high *= 2.0;
5980          while (high - low > tol) {
5981                   double mid = low + (high - low) / 2.0;
5982                   if (p_body(n, mid, sd, sig_level, tsample, tside, strict) < power) low = mid;
5983                   else high = mid;
5984          }
5985          delta = low + (high - low) / 2.0;
5986        } else if (is_null_sig_level) {
5987          double low = 1e-10, high = 1.0 - 1e-10;
5988          while (high - low > tol) {
5989                   double mid = low + (high - low) / 2.0;
5990                   if (p_body(n, delta, sd, mid, tsample, tside, strict) < power) low = mid;
5991                   else high = mid;
5992          }
5993          sig_level = low + (high - low) / 2.0;
5994        }
5995        HV*restrict ret = newHV();
5996        hv_stores(ret, "n", newSVnv(n));
5997        hv_stores(ret, "delta", newSVnv(delta));
5998        hv_stores(ret, "sd", newSVnv(sd));
5999        hv_stores(ret, "sig.level", newSVnv(sig_level));
6000        hv_stores(ret, "power", newSVnv(power));
6001        hv_stores(ret, "alternative", newSVpv(alternative, 0));
6002        const char*restrict m_str = (tsample == 1) ? (strEQ(type, "paired") ? "Paired t test power calculation" : "One-sample t test power calculation") : "Two-sample t test power calculation";
6003        hv_stores(ret, "method", newSVpv(m_str, 0));
6004        const char*restrict n_str = (tsample == 2) ? "n is number in *each* group" : (strEQ(type, "paired") ? "n is number of *pairs*, sd is std.dev. of *differences* within pairs" : "");
6005        if (n_str[0] != '\0') hv_stores(ret, "note", newSVpv(n_str, 0));
6006        RETVAL = newRV_noinc((SV*)ret);
6007}
6008OUTPUT:
6009    RETVAL
6010
6011SV* kruskal_test(...)
6012CODE:
6013{
6014        SV *restrict x_sv = NULL, *restrict g_sv = NULL, *restrict h_sv = NULL;
6015        unsigned int arg_idx = 0;
6016
6017        // 1. Shift positional arguments
6018        //    Accept either: (arrayref, arrayref) or (hashref)
6019        if (arg_idx < items && SvROK(ST(arg_idx))) {
6020          svtype t = SvTYPE(SvRV(ST(arg_idx)));
6021          if (t == SVt_PVAV) {
6022                   x_sv = ST(arg_idx++);
6023          } else if (t == SVt_PVHV) {
6024                   h_sv = ST(arg_idx++);          /* hash-of-arrays shortcut */
6025          }
6026        }
6027        if (!h_sv && arg_idx < items
6028                     && SvROK(ST(arg_idx))
6029                     && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) {
6030          g_sv = ST(arg_idx++);
6031        }
6032        // 2. Parse named arguments (fallback)
6033        for (; arg_idx < items; arg_idx += 2) {
6034          const char *restrict key = SvPV_nolen(ST(arg_idx));
6035          SV         *restrict val = ST(arg_idx + 1);
6036          if      (strEQ(key, "x")) x_sv = val;
6037          else if (strEQ(key, "g")) g_sv = val;
6038          else if (strEQ(key, "h")) h_sv = val;
6039          else croak("kruskal_test: unknown argument '%s'", key);
6040        }
6041        // 3. Mutual-exclusion guard
6042        if (h_sv && (x_sv || g_sv))
6043          croak("kruskal_test: cannot mix 'h' (hash-of-arrays) with 'x'/'g' inputs");
6044
6045        /* ------------------------------------------------------------------ */
6046        /* Shared state filled by whichever input branch runs                 */
6047        /* ------------------------------------------------------------------ */
6048        RankInfo *restrict ri = NULL;
6049        char **restrict group_names = NULL; /* Track names to build group_stats */
6050        size_t valid_n = 0;
6051        size_t k       = 0;
6052
6053        /* ------------------------------------------------------------------ */
6054        /* 4a. Hash-of-arrays input path                                      */
6055        /*     my %x = ( group1 => [...], group2 => [...], ... )              */
6056        /* ------------------------------------------------------------------ */
6057        if (h_sv) {
6058          if (!SvROK(h_sv) || SvTYPE(SvRV(h_sv)) != SVt_PVHV)
6059                   croak("kruskal_test: 'h' must be a HASH reference");
6060          HV *restrict h_hv = (HV*)SvRV(h_sv);
6061          // First pass – validate values and tally total elements
6062          size_t total = 0;
6063          hv_iterinit(h_hv);
6064          HE *restrict he;
6065          while ((he = hv_iternext(h_hv))) {
6066                   SV *restrict val = HeVAL(he);
6067                   if (!SvROK(val) || SvTYPE(SvRV(val)) != SVt_PVAV)
6068                       croak("kruskal_test: every value in 'h' must be an ARRAY reference");
6069                   total += (size_t)(av_len((AV*)SvRV(val)) + 1);
6070          }
6071          if (total < 2) croak("not enough observations");
6072
6073          ri = (RankInfo *)safemalloc(total * sizeof(RankInfo));
6074          size_t num_keys = HvKEYS(h_hv);
6075          group_names = (char **)safecalloc(num_keys, sizeof(char*));
6076          /* Second pass – fill ri[], assigning one group_id per hash key */
6077          size_t group_id = 0;
6078          hv_iterinit(h_hv);
6079          while ((he = hv_iternext(h_hv))) {
6080                   STRLEN klen;
6081                   const char *restrict key_str = HePV(he, klen);
6082                   group_names[group_id] = savepvn(key_str, klen); // Save string key
6083
6084                   AV *restrict av  = (AV*)SvRV(HeVAL(he));
6085                   size_t       n_g = (size_t)(av_len(av) + 1);
6086                   for (size_t i = 0; i < n_g; i++) {
6087                       SV **restrict el = av_fetch(av, i, 0);
6088                       if (el && SvOK(*el) && looks_like_number(*el)) {
6089                           ri[valid_n].val = SvNV(*el);
6090                           ri[valid_n].idx = group_id;   /* group identity */
6091                           valid_n++;
6092                       }
6093                   }
6094                   group_id++;
6095          }
6096          k = group_id;   /* number of unique groups = number of hash keys */
6097
6098        /* ------------------------------------------------------------------ */
6099        /* 4b. Original x / g array-pair input path                           */
6100        /* ------------------------------------------------------------------ */
6101        } else {
6102          if (!x_sv || !SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV)
6103                   croak("kruskal_test: 'x' is a required argument and must be an ARRAY reference");
6104          if (!g_sv || !SvROK(g_sv) || SvTYPE(SvRV(g_sv)) != SVt_PVAV)
6105                   croak("kruskal_test: 'g' is a required argument and must be an ARRAY reference");
6106
6107          AV *restrict x_av = (AV*)SvRV(x_sv);
6108          AV *restrict g_av = (AV*)SvRV(g_sv);
6109          size_t nx = (size_t)(av_len(x_av) + 1);
6110          size_t ng = (size_t)(av_len(g_av) + 1);
6111          if (nx != ng) croak("kruskal_test: 'x' and 'g' must have the same length");
6112          if (nx < 2)   croak("not enough observations");
6113
6114          ri = (RankInfo *)safemalloc(nx * sizeof(RankInfo));
6115          group_names = (char **)safecalloc(nx, sizeof(char*)); // Upper bound
6116
6117          // Map string group names → contiguous integer IDs
6118          HV *restrict group_map    = newHV();
6119          size_t          next_group_id = 0;
6120
6121          for (size_t i = 0; i < nx; i++) {
6122                   SV **restrict x_el = av_fetch(x_av, i, 0);
6123                   SV **restrict g_el = av_fetch(g_av, i, 0);
6124                   if (x_el && SvOK(*x_el) && looks_like_number(*x_el)
6125                            && g_el && SvOK(*g_el)) {
6126                       const char *restrict g_str = SvPV_nolen(*g_el);
6127                       STRLEN               glen  = strlen(g_str);
6128                       SV   **restrict id_sv = hv_fetch(group_map, g_str, glen, 0);
6129                       size_t group_id;
6130                       if (id_sv) {
6131                           group_id = SvUV(*id_sv);
6132                       } else {
6133                           group_id = next_group_id++;
6134                           hv_store(group_map, g_str, glen, newSVuv(group_id), 0);
6135                           group_names[group_id] = savepvn(g_str, glen); // Save string key
6136                       }
6137                       ri[valid_n].val = SvNV(*x_el);
6138                       ri[valid_n].idx = group_id;
6139                       valid_n++;
6140                   }
6141          }
6142          k = next_group_id;
6143          SvREFCNT_dec(group_map);
6144        }
6145
6146        /* ------------------------------------------------------------------ */
6147        /* 5. Shared post-extraction validation                               */
6148        /* ------------------------------------------------------------------ */
6149        if (valid_n < 2 || k < 2) {
6150          Safefree(ri);
6151          if (group_names) {
6152                   for (size_t i = 0; i < k; i++) { if (group_names[i]) Safefree(group_names[i]); }
6153                   Safefree(group_names);
6154          }
6155          if (valid_n < 2) croak("not enough observations");
6156          croak("all observations are in the same group");
6157        }
6158
6159        // 6. Ranking and Tie Accumulation (Reusing LikeR Helper)
6160        bool   has_ties = 0;
6161        double tie_adj  = rank_and_count_ties(ri, valid_n, &has_ties);
6162
6163        // 7. Aggregate Sum of Ranks AND Actual Values by Group
6164        double *restrict group_rank_sums = (double *)safecalloc(k, sizeof(double));
6165        double *restrict group_val_sums  = (double *)safecalloc(k, sizeof(double)); // For Mean
6166        size_t *restrict group_counts    = (size_t *)safecalloc(k, sizeof(size_t));
6167
6168        for (size_t i = 0; i < valid_n; i++) {
6169          size_t g_id = ri[i].idx;
6170          group_rank_sums[g_id] += ri[i].rank;
6171          group_val_sums[g_id]  += ri[i].val;
6172          group_counts[g_id]++;
6173        }
6174
6175        // 8. Calculate STATISTIC
6176        double stat_base = 0.0;
6177        for (size_t i = 0; i < k; i++) {
6178          if (group_counts[i] > 0)
6179                   stat_base += (group_rank_sums[i] * group_rank_sums[i])
6180                                / (double)group_counts[i];
6181        }
6182
6183        double n_d  = (double)valid_n;
6184        double stat = (12.0 * stat_base / (n_d * (n_d + 1.0))) - 3.0 * (n_d + 1.0);
6185        if (tie_adj > 0.0) {
6186          double tie_denom = 1.0 - (tie_adj / (n_d * n_d * n_d - n_d));
6187          stat /= tie_denom;
6188        }
6189        int    df    = (int)k - 1;
6190        double p_val = get_p_value(stat, df);
6191
6192        // 9. Return structured data exactly like R's htest
6193        HV *restrict res = newHV();
6194        hv_stores(res, "statistic", newSVnv(stat));
6195        hv_stores(res, "parameter", newSViv(df));
6196        hv_stores(res, "p_value",   newSVnv(p_val));
6197        hv_stores(res, "p.value",   newSVnv(p_val));
6198        hv_stores(res, "method",    newSVpv("Kruskal-Wallis rank sum test", 0));
6199
6200        // 10. Build the group_stats hash
6201        HV *restrict group_stats = newHV();
6202        HV *restrict stats_mean  = newHV();
6203        HV *restrict stats_size  = newHV();
6204
6205        for (size_t i = 0; i < k; i++) {
6206          if (group_counts[i] > 0 && group_names[i]) {
6207                   double mean = group_val_sums[i] / (double)group_counts[i];
6208                   size_t nlen = strlen(group_names[i]);
6209
6210                   hv_store(stats_mean, group_names[i], nlen, newSVnv(mean), 0);
6211                   hv_store(stats_size, group_names[i], nlen, newSVuv(group_counts[i]), 0);
6212          }
6213          if (group_names[i]) Safefree(group_names[i]); // Clean up name copy
6214        }
6215
6216        // Embed the nested hashes
6217        hv_stores(group_stats, "mean", newRV_noinc((SV*)stats_mean));
6218        hv_stores(group_stats, "size", newRV_noinc((SV*)stats_size));
6219        hv_stores(res, "group_stats",  newRV_noinc((SV*)group_stats));
6220
6221        // Memory Cleanup
6222        Safefree(group_names);    Safefree(group_rank_sums);
6223        Safefree(group_val_sums); Safefree(group_counts);
6224        Safefree(ri);
6225
6226        RETVAL = newRV_noinc((SV*)res);
6227}
6228OUTPUT:
6229    RETVAL
6230
6231SV* var_test(...)
6232CODE:
6233{
6234        SV* restrict x_sv = NULL;
6235        SV* restrict y_sv = NULL;
6236        double ratio = 1.0, conf_level = 0.95;
6237        const char* restrict alternative = "two.sided";
6238        unsigned int arg_idx = 0;
6239
6240        // 1. Shift positional argument 'x' if it's an array reference
6241        if (arg_idx < items && SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) {
6242          x_sv = ST(arg_idx);
6243          arg_idx++;
6244        }
6245
6246        // 2. Shift positional argument 'y' if it's an array reference
6247        if (arg_idx < items && SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) {
6248          y_sv = ST(arg_idx);
6249          arg_idx++;
6250        }
6251
6252        // Ensure the remaining arguments form complete key-value pairs
6253        if ((items - arg_idx) % 2 != 0) {
6254          croak("Usage: var_test(\\@x, \\@y, key => value, ...)");
6255        }
6256
6257        // --- Parse named arguments from the remaining flat stack ---
6258        for (; arg_idx < items; arg_idx += 2) {
6259          const char* restrict key = SvPV_nolen(ST(arg_idx));
6260          SV* restrict val = ST(arg_idx + 1);
6261
6262          if      (strEQ(key, "x"))           x_sv        = val;
6263          else if (strEQ(key, "y"))           y_sv        = val;
6264          else if (strEQ(key, "ratio"))       ratio       = SvNV(val);
6265          else if (strEQ(key, "conf_level") || strEQ(key, "conf.level")) conf_level = SvNV(val);
6266          else if (strEQ(key, "alternative")) alternative = SvPV_nolen(val);
6267          else croak("var_test: unknown argument '%s'", key);
6268        }
6269
6270        // --- Validate required inputs / types ---
6271        if (!x_sv || !SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV)
6272          croak("var_test: 'x' is a required argument and must be an ARRAY reference");
6273        if (!y_sv || !SvROK(y_sv) || SvTYPE(SvRV(y_sv)) != SVt_PVAV)
6274          croak("var_test: 'y' is a required argument and must be an ARRAY reference");
6275
6276        if (ratio <= 0.0 || !isfinite(ratio))
6277          croak("var_test: 'ratio' must be a single positive number");
6278        if (conf_level <= 0.0 || conf_level >= 1.0 || !isfinite(conf_level))
6279          croak("var_test: 'conf.level' must be a single number between 0 and 1");
6280
6281        AV* restrict x_av = (AV*)SvRV(x_sv);
6282        AV* restrict y_av = (AV*)SvRV(y_sv);
6283        size_t nx_raw = av_len(x_av) + 1;
6284        size_t ny_raw = av_len(y_av) + 1;
6285
6286        // --- Computation via Welford's Algorithm (ignoring NaNs) ---
6287        double mean_x = 0.0, M2_x = 0.0;
6288        size_t nx = 0;
6289        for (size_t i = 0; i < nx_raw; i++) {
6290                SV** restrict tv = av_fetch(x_av, i, 0);
6291                if (tv && SvOK(*tv) && looks_like_number(*tv)) {
6292                        double val = SvNV(*tv);
6293                        if (!isnan(val) && isfinite(val)) {
6294                                nx++;
6295                                double delta = val - mean_x;
6296                                mean_x += delta / nx;
6297                                M2_x += delta * (val - mean_x);
6298                        }
6299                }
6300        }
6301
6302        double mean_y = 0.0, M2_y = 0.0;
6303        size_t ny = 0;
6304        for (size_t i = 0; i < ny_raw; i++) {
6305          SV** restrict tv = av_fetch(y_av, i, 0);
6306          if (tv && SvOK(*tv) && looks_like_number(*tv)) {
6307                   double val = SvNV(*tv);
6308                   if (!isnan(val) && isfinite(val)) {
6309                       ny++;
6310                       double delta = val - mean_y;
6311                       mean_y += delta / ny;
6312                       M2_y += delta * (val - mean_y);
6313                   }
6314          }
6315        }
6316
6317        if (nx < 2) croak("not enough 'x' observations");
6318        if (ny < 2) croak("not enough 'y' observations");
6319
6320        double df_x = (double)(nx - 1);
6321        double df_y = (double)(ny - 1);
6322        double var_x = M2_x / df_x;
6323        double var_y = M2_y / df_y;
6324
6325        if (var_y == 0.0) croak("var_test: variance of 'y' is zero (cannot divide by zero)");
6326
6327        // --- Statistics Math ---
6328        double estimate = var_x / var_y;
6329        double statistic = estimate / ratio;
6330
6331        double p_val = pf(statistic, df_x, df_y);
6332        double ci_lower = 0.0, ci_upper = INFINITY;
6333
6334        if (strcmp(alternative, "less") == 0) {
6335          ci_upper = estimate / qf_bisection(1.0 - conf_level, df_x, df_y);
6336        } else if (strcmp(alternative, "greater") == 0) {
6337          p_val = 1.0 - p_val;
6338          ci_lower = estimate / qf_bisection(conf_level, df_x, df_y);
6339        } else {
6340          // two.sided
6341          double p1 = p_val;
6342          double p2 = 1.0 - p_val;
6343          p_val = 2.0 * (p1 < p2 ? p1 : p2);
6344
6345          double beta = (1.0 - conf_level) / 2.0;
6346          ci_lower = estimate / qf_bisection(1.0 - beta, df_x, df_y);
6347          ci_upper = estimate / qf_bisection(beta, df_x, df_y);
6348        }
6349
6350        // --- Pack Results ---
6351        HV* restrict results = newHV();
6352        hv_store(results, "statistic", 9, newSVnv(statistic), 0);
6353
6354        AV* restrict param_av = newAV();
6355        av_push(param_av, newSVnv(df_x));
6356        av_push(param_av, newSVnv(df_y));
6357        hv_store(results, "parameter", 9, newRV_noinc((SV*)param_av), 0);
6358
6359        hv_store(results, "p_value", 7, newSVnv(p_val), 0);
6360
6361        AV* restrict conf_int = newAV();
6362        av_push(conf_int, newSVnv(ci_lower));
6363        av_push(conf_int, newSVnv(ci_upper));
6364        hv_store(results, "conf_int", 8, newRV_noinc((SV*)conf_int), 0);
6365
6366        hv_store(results, "estimate", 8, newSVnv(estimate), 0);
6367        hv_store(results, "null_value", 10, newSVnv(ratio), 0);
6368        hv_store(results, "alternative", 11, newSVpv(alternative, 0), 0);
6369        hv_store(results, "method", 6, newSVpv("F test to compare two variances", 0), 0);
6370
6371        RETVAL = newRV_noinc((SV*)results);
6372}
6373OUTPUT:
6374    RETVAL
6375
6376SV *sample(ref, n = 1)
6377    SV *ref
6378    IV n
6379PREINIT:
6380    SV *restrict ret = &PL_sv_undef;
6381CODE:
6382        if (!PL_srand_called) {
6383          (void)seedDrand01((Rand_seed_t)Perl_seed(aTHX));
6384          PL_srand_called = TRUE;
6385        }
6386        if (n < 0) n = 0;
6387        if (SvROK(ref)) {
6388                SV *restrict rv = SvRV(ref);
6389                /* --- HASH REFERENCE --- */
6390                if (SvTYPE(rv) == SVt_PVHV) {
6391                        HV *restrict hv    = (HV *)rv;
6392                        I32 count = hv_iterinit(hv);
6393                        I32 limit = (n < (IV)count) ? (I32)n : count;
6394                        HV *restrict ret_hv = newHV();
6395
6396                        if (count > 0 && limit > 0) {
6397                                 HE **restrict entries;
6398                                 HE  *restrict entry;
6399                                 unsigned i;
6400
6401                                 Newx(entries, count, HE *);
6402
6403                                 /* Collect all HE pointers in one pass */
6404                                 i = 0;
6405                                 while ((entry = hv_iternext(hv)))
6406                                     entries[i++] = entry;
6407
6408                                 /* Partial Fisher-Yates (only 'limit' passes) */
6409                                 for (i = 0; i < limit; i++) {
6410                                     I32 j    = i + (I32)(Drand01() * (count - i));
6411                                     HE *restrict tmp  = entries[i];
6412                                     entries[i] = entries[j];
6413                                     entries[j] = tmp;
6414                                 }
6415
6416                                 /* Pre-size result hash to avoid rehashing during population */
6417                                 hv_ksplit(ret_hv, limit);
6418
6419                                 for (i = 0; i < limit; i++) {
6420                                     HEK *restrict hek = HeKEY_hek(entries[i]);
6421                                     /*
6422                                      * hv_store() with a precomputed hash skips the hash
6423                                      * computation entirely.  Negative klen signals UTF-8.
6424                                      */
6425                                     (void)hv_store(
6426                                         ret_hv,
6427                                         HEK_KEY(hek),
6428                                         HEK_UTF8(hek) ? -(I32)HEK_LEN(hek) : (I32)HEK_LEN(hek),
6429                                         SvREFCNT_inc(HeVAL(entries[i])),  /* HeVAL: direct macro, no call */
6430                                         HeHASH(entries[i])                /* reuse precomputed hash */
6431                                     );
6432                                 }
6433                                 Safefree(entries);
6434                        }
6435                        ret = newRV_noinc((SV *)ret_hv);
6436                } else if (SvTYPE(rv) == SVt_PVAV) {/* --- ARRAY REFERENCE --- */
6437                        AV    *restrict av    = (AV *)rv;
6438                        size_t count = av_top_index(av) + 1;  /* signed; 0 for empty AV */
6439                        size_t limit = (n < count) ? (size_t)n : count;
6440                        AV    *restrict ret_av = newAV();
6441
6442                        /* Pre-allocate the result array to avoid incremental reallocs */
6443                        if (n > 0)
6444                                 av_extend(ret_av, (size_t)n - 1);
6445
6446                        if (count > 0) {
6447                                 SV    **restrict src = AvARRAY(av);   /* direct pointer into AV's C array */
6448                                 size_t *restrict idx;
6449                                 size_t  i;
6450
6451                                 /* Shuffle indices rather than SV** to keep the original AV intact */
6452                                 Newx(idx, count, size_t);
6453                                 for (i = 0; i < count; i++)
6454                                     idx[i] = i;
6455
6456                                 /* Partial Fisher-Yates on the index array */
6457                                 for (i = 0; i < limit; i++) {
6458                                     size_t j   = i + (size_t)(Drand01() * (count - i));
6459                                     size_t tmp = idx[i];
6460                                     idx[i]  = idx[j];
6461                                     idx[j]  = tmp;
6462                                 }
6463
6464                                 for (i = 0; i < (size_t)n; i++) {
6465                                     if (i < limit) {
6466                                         SV *restrict sv = src[idx[i]];   /* AvARRAY direct access — no av_fetch call */
6467                                         SV *push_sv;
6468                                                        if (sv && sv != &PL_sv_undef)
6469                                                                 push_sv = SvREFCNT_inc(sv);
6470                                                        else
6471                                                                 push_sv = newSV(0);
6472                                                        av_push(ret_av, push_sv);
6473                                     } else {
6474                                         av_push(ret_av, newSV(0));
6475                                     }
6476                                 }
6477                                 Safefree(idx);
6478                        } else {
6479                                for (size_t i = 0; i < (size_t)n; i++)
6480                                    av_push(ret_av, newSV(0));
6481                        }
6482                        ret = newRV_noinc((SV *)ret_av);
6483                }
6484        }
6485        RETVAL = ret;
6486OUTPUT:
6487    RETVAL