/*
 * STAN - Stream Analyser
 * Copyright (c) 2001 Konrad Rieck <kr@roqe.org>
 * The Roqefellaz, http://www.roqe.org/stan
 *
 * Here you'll find all the mathematical stuff. In general it is simple, but
 * remember that all sums have to be built over the nodes of the treap and
 * therefore it is necessary to traverse the treap.
 * $Id: stats.c,v 1.10 2001/04/10 10:56:53 kr Exp $
 */

#include <stdio.h>
#include <ctype.h>
#include <math.h>

#include <stan.h>
#include <treap.h>
#include <pattern.h>
#include <data.h>
#include <stats.h>
#include <config.h>

/* Defined elsewhere. The functions in this file index it as
 * treap[patlen - 1] for patterns of length patlen, and use treap[0]
 * for all single-byte statistics. */
extern tnode_t **treap;

/* Non-zero until calc_correlation() has consumed its first byte. */
int sccfirst;

/* Results published by the calc_*() routines and complete_correlation(). */
double mean, median, variance, deviation, chi_squared, entropy, scc;

/* Scratch values shared between the calc_*() drivers and their treap
 * traversal callbacks. */
double sum, total_size, tmp_median;

/* Serial correlation state: current, first and previous byte value,
 * followed by the three running terms of the coefficient. */
double sccun, sccu0, scclast, scct1, scct2, scct3;

/*
 * Print the one-line summary for the treap holding patterns of length
 * patlen: the pattern length followed by the values reported by
 * tsize_tnodes(), tsize_total(), size_data() and tdepth().
 *
 * patlen is a size_t, so it must not be passed to a %d conversion. The
 * helper functions are declared elsewhere; their results are converted
 * to long so that argument and conversion specifier agree whether they
 * return an int or a size_t.
 */
void print_header_patterns(size_t patlen)
{
    size_t slot = patlen - 1;

    printf("Pattern length %lu, different %ld, total %ld, "
	   "bytes %ld, depth %ld\n",
	   (unsigned long) patlen, (long) tsize_tnodes(&treap[slot]),
	   (long) tsize_total(&treap[slot]), (long) size_data(),
	   (long) tdepth(&treap[slot]));
}

/*
 * Print the two patterns delivered by tmin() and tmax() for the treap of
 * pattern length patlen (presumably the smallest and the largest pattern
 * stored -- confirm against the treap implementation).
 *
 * For patterns longer than six bytes the second pattern is moved to a
 * line of its own.
 */
void print_range_patterns(size_t patlen)
{
    size_t slot = patlen - 1;

    fputs(" - Pattern range\n", stdout);
    fputs("   ", stdout);
    print_pattern(*tmin(&treap[slot]), patlen);

    fputs(" - ", stdout);
    if (patlen > 6) {
	fputs("\n   ", stdout);
    }

    print_pattern(*tmax(&treap[slot]), patlen);
    fputs("\n", stdout);
}

/*
 * Print up to `number` patterns of length patlen, always taking the
 * current root of the treap.
 *
 * After a root has been printed, tdelete() is called on it; presumably
 * this removes the node so that the next most frequent pattern becomes
 * the root (confirm against the treap implementation). The function is
 * therefore destructive for the treap it prints.
 *
 * Patterns are laid out three per line for lengths 1-2, two per line for
 * lengths 3-6 and one per line beyond that. The line is also closed after
 * the last pattern printed.
 */
void print_top_patterns(int number, size_t patlen)
{
    size_t slot = patlen - 1;
    int printed = 0;
    int row_full;

    printf(" - %d most used patterns\n", number);
    while (printed < number && treap[slot]) {
	printf("   ");
	print_pattern(treap[slot], patlen);
	tdelete(&treap[slot], patlen);

	/* Number of columns depends on how wide a single pattern is. */
	if (slot >= 6)
	    row_full = 1;
	else if (slot >= 2)
	    row_full = (printed % 2 == 1);
	else
	    row_full = (printed % 3 == 2);

	if (row_full || !treap[slot] || printed == number - 1)
	    printf("\n");

	printed++;
    }
}

/*
 * Reset the state that is accumulated while the stream is read: the
 * shared sum, the entropy, the three serial correlation terms and the
 * "first byte" marker used by calc_correlation().
 */
void init_stats()
{
    entropy = 0;
    sum = 0;

    scct3 = 0.0;
    scct2 = 0.0;
    scct1 = 0.0;
    sccfirst = TRUE;
}

/*
 * Feed one byte of the stream into the serial correlation accumulators.
 *
 * scct1 collects the products of neighbouring bytes, scct2 the plain sum
 * and scct3 the sum of squares. The very first byte has no predecessor;
 * it is remembered in sccu0 so that complete_correlation() can pair it
 * with the last byte.
 */
void calc_correlation(byte_t b)
{
    sccun = b;

    if (!sccfirst) {
	scct1 += scclast * sccun;
    } else {
	sccfirst = FALSE;
	scclast = 0;
	sccu0 = sccun;
    }

    scct3 += sccun * sccun;
    scct2 += sccun;
    scclast = sccun;
}

/*
 * Finish the serial correlation and store the coefficient in scc.
 *
 * The last byte is paired with the first one, then scct2 is replaced by
 * its square. Both accumulators are modified in place, so calling this
 * function a second time without re-accumulating gives a different
 * result. When the denominator is zero, scc is left at 0.
 */
void complete_correlation()
{
    scct1 += scclast * sccu0;
    scct2 *= scct2;

    /* scc first holds the denominator, then the final coefficient. */
    scc = (double) tsize_total(&treap[0]) * scct3 - scct2;
    if (scc == 0)
	return;

    scc = ((double) tsize_total(&treap[0]) * scct1 - scct2) / scc;
}

/*
 * Traversal callback for calc_mean(): add the node's byte value, weighted
 * by the node's count, to the shared sum.
 */
void calc_sum(tnode_t * tnode)
{
    double value = (double) tnode->pattern[0];

    sum = sum + value * tnode->count;
}

/*
 * Traversal callback for calc_median(). Here sum is the running total of
 * the counts of the nodes accepted so far. As long as it is below half of
 * total_size, the node's byte value becomes the median candidate; later
 * nodes leave the candidate untouched.
 */
void find_median(tnode_t * tnode)
{
    if (!(sum < total_size / 2))
	return;

    tmp_median = (double) tnode->pattern[0];
    sum += tnode->count;
}

/*
 * Compute the arithmetic mean over treap[0]: the count-weighted sum
 * gathered by calc_sum() divided by tsize_total(). The result is stored
 * in mean.
 */
void calc_mean()
{
    double total;

    sum = 0;
    ttraverse(&treap[0], calc_sum, inorder);

    total = (double) tsize_total(&treap[0]);
    mean = sum / total;
}


/*
 * Determine the median over treap[0] and store it in median.
 *
 * The work is done by find_median() during an inorder traversal; this
 * function prepares the shared state and publishes the candidate that the
 * traversal leaves behind in tmp_median.
 */
void calc_median()
{
    sum = 0;
    /* Without this reset a traversal that visits no node would publish
     * the candidate of a previous run. */
    tmp_median = 0;
    total_size = tsize_total(&treap[0]);
    ttraverse(&treap[0], find_median, inorder);
    median = tmp_median;
}

/*
 * Traversal callback for calc_variance(): add the squared distance of the
 * node's byte value from mean, weighted by the node's count, to the
 * shared sum. mean must already be up to date.
 */
void calc_variance_sum(tnode_t * tnode)
{
    double distance = mean - (double) tnode->pattern[0];

    sum += pow(distance, 2) * (double) tnode->count;
}

/*
 * Traversal callback for calc_chi_squared(): add this node's term
 * (observed - expected)^2 / expected to the shared sum, where observed is
 * the node's count and expected is tsize_total() / BYTE_MAX.
 *
 * The expected frequency is computed once and used for both the numerator
 * and the denominator.
 *
 * NOTE(review): only nodes present in the treap are visited. If byte
 * values that never occur have no node, their terms are missing from the
 * statistic -- confirm against the treap implementation.
 */
void calc_chi_squared_sum(tnode_t * tnode)
{
    double expected = tsize_total(&treap[0]) / (double) BYTE_MAX;

    sum += pow(tnode->count - expected, 2) / expected;
}

/*
 * Compute the chi-square value over treap[0]: clear the shared sum, let
 * calc_chi_squared_sum() add one term per node, then store the total in
 * chi_squared.
 */
void calc_chi_squared()
{
    sum = 0;

    ttraverse(&treap[0], calc_chi_squared_sum, inorder);

    chi_squared = sum;
}

/*
 * Compute the variance over treap[0] and store it in variance: the
 * weighted squared distances gathered by calc_variance_sum() divided by
 * tsize_total(). The callback reads mean, so calc_mean() has to run
 * first.
 */
void calc_variance()
{
    double total;

    sum = 0;
    ttraverse(&treap[0], calc_variance_sum, inorder);

    total = (double) tsize_total(&treap[0]);
    variance = sum / total;
}

/*
 * Store the square root of variance in deviation. variance must already
 * have been computed.
 */
void calc_deviation()
{
    double spread = variance;

    deviation = sqrt(spread);
}

/*
 * Traversal callback for calc_entropy(): add the node's share
 * p * log2(1 / p) to entropy, where p is the node's count divided by
 * tsize_total(). The base-2 logarithm is formed from log10() and the
 * log2of10 factor. Nodes whose count is not positive are skipped.
 */
void calc_ent(tnode_t * tnode)
{
    double prob;

    if (!(tnode->count > 0))
	return;

    prob = tnode->count / (double) tsize_total(&treap[0]);
    entropy += prob * (log10(1 / prob) * log2of10);
}

/*
 * Compute the entropy over treap[0]: clear entropy and let calc_ent()
 * add the contribution of every node.
 */
void calc_entropy()
{
    entropy = 0;

    ttraverse(&treap[0], calc_ent, inorder);
}

/*
 * Compute and print the general statistics of the stream: mean, median,
 * deviation, chi-square value, entropy and serial correlation.
 *
 * The order of the calls matters: the variance is built on mean, and the
 * deviation on the variance. complete_correlation() modifies its
 * accumulators, so the correlation is only valid for the first call after
 * the stream has been fed through calc_correlation().
 */
void print_stats()
{
    /* size_data() is declared elsewhere; converting to long keeps the
     * argument in step with the conversion specifier whether it returns
     * an int or a size_t. */
    printf("General statistics for the stream, bytes %ld\n",
	   (long) size_data());

    calc_mean();
    printf("   Arithmetic mean:  ");
    print_double(mean);
    printf("\n");

    calc_median();
    printf("   Median:           ");
    print_double(median);
    printf("\n");

    /* Needs the mean computed above. */
    calc_variance();
    calc_deviation();
    printf("   Deviation:        ");
    print_double(deviation);
    printf("\n");

    calc_chi_squared();
    printf("   Chi-Square test:  ");
    print_double(chi_squared);
    printf("\n");

    calc_entropy();
    printf("   Entropy per byte: ");
    print_double(entropy);
    printf("\n");

    complete_correlation();
    printf("   Correlation co.:  ");
    print_double(scc);
    printf("\n\n");

}

/*
 * Print d with six decimals in a field of 15 characters. If d is positive,
 * not larger than BYTE_MAX and its integer part is a printable character,
 * the hexadecimal code and the character itself are appended.
 *
 * NOTE(review): isprint() is only defined for values representable as
 * unsigned char (or EOF); this relies on BYTE_MAX not exceeding
 * UCHAR_MAX -- confirm against the definition of BYTE_MAX.
 */
void print_double(double d)
{
    int code;

    printf("%15.6f", d);

    if (!(d > 0 && d <= BYTE_MAX))
	return;

    code = (int) d;
    if (!isprint(code))
	return;

    printf("  ~  0x%.2x", code);
    printf("(%c)", code);
}
