#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#ifdef USE_THREADS
#include <pthread.h>
#else /* USE_THREADS */
#include <sys/wait.h>
#endif /* USE_THREADS */
#include <signal.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netdb.h>
#include <sys/mman.h>
#include <assert.h>

#define NETPERF
#include "netlib3.h"

#define NETPERF_VERSION 3
#define NETPERF_UPDATE  0
#define NETPERF_FIX     0

/* seems that compilation on a stock HP-UX 11.00 system does not get
   these definitions - probably something I need to submit a bug for,
   or perhaps something I am not getting in my compile line? */

extern char *strtok_r(char *, const char *, char **);
extern char *strchr(const char *, int);

/* the test whose timer is currently armed in the non-threaded build:
   start_timer() stores the test here and the SIGALRM handler
   timer_func() sets its times_up flag. */
test_t *netlib3_test;

/* our global switch table. I wonder if there is an issue with forking
   and the entries remaining valid? netlib3_init() points every slot
   at unknown_function() before calling the per-suite init routines. */
FuncWithArg netperf_switch[TEST_LAST][NUM_FUNC];

/* globals for the confidence intervals. NOTE(review): none of these
   are referenced in the part of the file visible here - confirm
   their units and valid ranges against the code that uses them. */
uint32_t iteration_max;
uint32_t iteration_min;
int32_t  confidence_level;
double   interval;

/* globals for CPU utilization measurement. NOTE(review): not
   referenced in the visible part of the file either. */
uint32_t shell_num_cpus;
float    local_cpu_rate;
int32_t  local_cpu_usage;

/* control debugging output. a non-zero debug enables the diagnostic
   messages; a few of the more verbose dumps require debug > 1. where
   is the stream those messages are written to - netlib3_init() sets
   it to stdout. */
int32_t  debug;
FILE     *where;

/* synchronization stuff. we have to remember to allocate this out of
   a shared memory region for the non-threaded case! both barriers are
   allocated by netlib3_init(): with malloc() when USE_THREADS is
   defined, with allocate_shared_memory() otherwise. */
barrier_t *netperf_barrier;
barrier_t *netperf_stop_barrier;

/* placeholder installed in every netperf_switch slot by
   netlib3_init(). if it is ever called, the requested test was not
   compiled in or never registered itself, so we complain on the
   diagnostic stream and terminate the program. the test argument is
   not examined. */
void
unknown_function(test_t *test)
{
  static const char complaint[] =
    "An attempt was made to run a test that is not compiled\n"
    "into netperf, or is not being properly initialized.\n"
    "Please check that the proper compiler flags were set.\n";

  /* the text contains no conversion specifications, so writing it
     verbatim produces the same bytes a formatted print would */
  fputs(complaint, where);
  exit(-1);
}


 /* this routine will allocate a circular list of buffers for either */
 /* send or receive operations. each of these buffers will be aligned */
 /* and offset as per the user's request. the circumference of the */
 /* ring is given by width. the buffers will be filled with data from */
 /* the file named by fill_file, re-reading the file from the start */
 /* whenever its end is reached. if fill_file is NULL or an empty */
 /* string, the buffers are not filled with any particular data. */
 /* */
 /* width       - number of buffers in the ring; a value below 1 */
 /*               yields no ring at all and NULL is returned */
 /* buffer_size - usable bytes in each buffer */
 /* alignment   - buffer_ptr is rounded up to a multiple of this */
 /*               before offset is added; values below 1 are treated */
 /*               as 1 (no alignment) and it need not be a power of */
 /*               two */
 /* offset      - bytes added to the aligned address */
 /* */
 /* returns a pointer to one element of the ring. allocation failures */
 /* and an unopenable fill file terminate the program. */

ring_elt_t *
allocate_buffer_ring(int32_t width,
		     int32_t buffer_size,
		     int32_t alignment,
		     int32_t offset,
		     char *fill_file)
{

  ring_elt_t *first_link = NULL;
  ring_elt_t *temp_link  = NULL;
  ring_elt_t *prev_link  = NULL;

  int i;
  int malloc_size;
  int bytes_left;
  int bytes_read;
  int bytes_since_rewind = 0;
  int do_fill;

  unsigned long misalignment;

  FILE *fill_source = NULL;

  if (debug) {
    fprintf(where,
	    "Asked to allocate a buffer ring with %d %d byte buffers\n",
	    width,
	    buffer_size);
    fflush(where);
  }

  /* with nothing to allocate there is no first element to close the
     ring on, so say so rather than dereference a NULL pointer */
  if (width < 1) {
    return(NULL);
  }

  /* an alignment of zero (or less) would otherwise turn the aligned
     pointer into garbage */
  if (alignment < 1) {
    alignment = 1;
  }

  /* leave enough slack to slide the start of the buffer up to the
     requested alignment and then on by the offset */
  malloc_size = buffer_size + alignment + offset;

  /* did the user wish to have the buffers pre-filled with data from a */
  /* particular source? */
  do_fill = ((fill_file != NULL) && (fill_file[0] != '\0'));
  if (do_fill) {
    fill_source = fopen(fill_file,"r");
    if (fill_source == NULL) {
      perror("Could not open requested fill file");
      exit(1);
    }
  }

  for (i = 1; i <= width; i++) {
    /* get the ring element */
    temp_link = (ring_elt_t *)malloc(sizeof(ring_elt_t));
    if (temp_link == NULL) {
      perror("Could not allocate a buffer ring element");
      exit(1);
    }
    /* remember the first one so we can close the ring at the end */
    if (i == 1) {
      first_link = temp_link;
    }

    temp_link->buffer_base = (char *)malloc(malloc_size);
    if (temp_link->buffer_base == NULL) {
      perror("Could not allocate a ring buffer");
      exit(1);
    }

    /* round the base address up to the next multiple of alignment */
    misalignment =
      (unsigned long)(temp_link->buffer_base) % (unsigned long)alignment;
    temp_link->buffer_ptr = temp_link->buffer_base;
    if (misalignment != 0) {
      temp_link->buffer_ptr += (unsigned long)alignment - misalignment;
    }
    temp_link->buffer_ptr += offset;

    if (do_fill) {
      bytes_left = buffer_size;
      while (bytes_left > 0) {
	/* append after whatever has already been placed in this
	   buffer instead of overwriting its beginning */
	bytes_read = fread(temp_link->buffer_ptr + (buffer_size - bytes_left),
			   1,
			   bytes_left,
			   fill_source);
	if (bytes_read > 0) {
	  bytes_left -= bytes_read;
	  bytes_since_rewind += bytes_read;
	}
	else if (feof(fill_source) && (bytes_since_rewind > 0)) {
	  /* ran off the end of a non-empty file, start over */
	  rewind(fill_source);
	  bytes_since_rewind = 0;
	}
	else {
	  /* either the file is empty or it cannot be read. trying
	     again would spin forever, so give up on filling */
	  fprintf(where,
		  "Could not read fill data from %s, buffers left unfilled\n",
		  fill_file);
	  fflush(where);
	  do_fill = 0;
	  break;
	}
      }
    }

    /* each new element points at the one allocated before it */
    temp_link->next = prev_link;
    prev_link = temp_link;
  }

  /* the first element still has a NULL next; pointing it at the last
     element allocated closes the circle */
  first_link->next = temp_link;

  if (fill_source != NULL) {
    fclose(fill_source);
  }

  return(first_link); /* it's a circle, doesn't matter which we return */
}

#ifdef USE_THREADS
/* body of the detached timer thread created by start_timer(): sleep
   for test->test_length seconds and then set test->times_up. */
void
timer_func(test_t *test)
{
  /* this is really rather simple, we go to sleep for the specified
     number of seconds, and when that time has expired, we set the
     test->times_up variable. I suppose I could generalize this a bit
     more and define a structure that takes a time in seconds, and a
     pointer to a location to set to one upon expiration, but that can
     wait. right now, all it has to do is work for a timer being
     started in a threadchild of netserver. the netperf side of things
     has the main thread doing the timing for the tests, but since
     netserver may be running several, unrelated tests, there has to
     be an "independent" timer raj 2/98 */
  unsigned int remaining = test->test_length;

  /* sleep() returns the unslept time when it is cut short by a
     signal, so keep going until the whole interval has elapsed */
  while (remaining != 0) {
    remaining = sleep(remaining);
  }

  test->times_up = 1;

  /* this used to be an unconditional printf to stdout; treat it like
     every other diagnostic in the file */
  if (debug) {
    fprintf(where,"timer expired for test %p\n",(void *)test);
    fflush(where);
  }
  /* I wonder if I need to do anything in particular here? */
}
#else /* USE_THREADS */
/* SIGALRM handler installed by start_timer(): flags the test that
   start_timer() recorded in netlib3_test as having run out of time.
   sig is not examined. */
void
timer_func(int sig)
{
  /* really simple, in theory :) assuming that netlib3_test is set
     right? */

  /* the code we interrupted may be about to look at errno, so put it
     back the way we found it before returning */
  int saved_errno = errno;

  (void) sig;

  /* NOTE(review): stdio is not async-signal-safe; this output is
     only produced when debugging is enabled */
  if (debug) {
    fprintf(where,
	    "Timer_func popped in pid %d\n",getpid());
    fflush(where);
  }

  netlib3_test->times_up = 1;

  errno = saved_errno;
}
#endif /* USE_THREADS */

#ifdef USE_THREADS
/* pthread_create() wants a function taking and returning void *, and
   timer_func() does neither, so adapt it here rather than calling it
   through a cast to an incompatible function pointer type. */
static void *
timer_thread_main(void *arg)
{
  timer_func((test_t *)arg);
  return(NULL);
}
#endif /* USE_THREADS */

/* arrange for test->times_up to be set once test->test_length
   seconds have passed. with USE_THREADS a detached thread running
   timer_func() does the waiting; otherwise timer_func() is installed
   as the SIGALRM handler, the test is remembered in netlib3_test and
   alarm() is armed. a failure to set things up terminates the
   program. */
void
start_timer(test_t *test)
{
#ifdef USE_THREADS

  pthread_t tid;

  int ret_val;

  /* just start the timer thread (I really wish they had defined some
     per-thread timer/signal functions, this business about creating
     extra threads seems rather silly). the thread probably needs to
     be "detatched" since no one is going to bother to reap it. */

  ret_val = pthread_create(&tid,
			   (pthread_attr_t *)NULL,
			   timer_thread_main,
			   test);
  if (ret_val != 0) {
    fprintf(where,
	    "start_timer: pthread_create failled with %d\n",ret_val);
    exit(-1);
  }

  ret_val = pthread_detach(tid);
  if (ret_val != 0) {
    fprintf(where,
	    "start_timer: pthread_detach failled with %d\n",ret_val);
    exit(-1);
  }

  if (debug) {
    fprintf(where,
	    "Thread %d started a %d second timer\n",
	    test->thread_num,
	    test->test_length);
    fflush(where);
  }

#else /* USE_THREADS */

  struct sigaction action;
  unsigned int previous;

  action.sa_handler = timer_func;
  /* SA_RESTART: system calls interrupted by the alarm are restarted
     rather than failing with EINTR */
  action.sa_flags = SA_RESTART;
  sigemptyset(&(action.sa_mask));
  sigaddset(&(action.sa_mask),SIGALRM);

  if (sigaction(SIGALRM, &action, NULL) < 0) {
    fprintf(where,"start_timer: error installing alarm handler ");
    fprintf(where,"errno %d\n",errno);
    exit(-1);
  }

  /* the handler finds the test through this global, so it has to be
     in place before the alarm can possibly fire */
  netlib3_test = test;

  /* this is the easy case - just set the timer for so many seconds.
     alarm() cannot fail; what it returns is the number of seconds
     that were left on a previously scheduled alarm, which the new
     request has now replaced. that is worth a mention when debugging
     but it is not an error. */
  previous = alarm(test->test_length);
  if ((previous != 0) && debug) {
    fprintf(where,
	    "start_timer: replaced a pending alarm with %u seconds left\n",
	    previous);
    fflush(where);
  }

  if (debug) {
    fprintf(where,
	    "Thread %d (pid %d) starting a %d second timer\n",
	    test->thread_num,getpid(),test->test_length);  
    fflush(where);
  }

#endif /* USE_THREADS */
}

/* start a worker that runs start_func(start_arg) on behalf of test.
   with USE_THREADS the worker is a thread whose id is stored in
   test->tid; if detached is non-zero the thread is detached so nobody
   has to join with it. without USE_THREADS the worker is a child
   process created with fork(), its pid is stored in test->tid, the
   child exits with status 0 when start_func returns, and detached is
   not used. a failure to create the worker terminates the program. */
void
start_worker(test_t *test, void *start_func(), void *start_arg, int detached)
{

#ifdef USE_THREADS
  int ret_val;
#else /* USE_THREADS */
  pid_t tmp_pid;
#endif /* USE_THREADS */

  if (debug) {
    /* %p rather than %x - pointers need not fit in an int. POSIX
       systems allow the function pointer to be viewed as a void * */
    fprintf(where,
	    "start_worker called with test %p func %p and arg %p\n",
	    (void *)test,(void *)start_func,start_arg);
    fflush(where);
  }

#ifdef USE_THREADS

  ret_val =  pthread_create(&(test->tid),
			    (pthread_attr_t *)NULL,
			    start_func,
			    start_arg);
  if (ret_val != 0) {
    printf("start_worker: pthread_create failled with %d\n",ret_val);
    exit(-1);
  }

  /* netserver will request that the threads be detached - there is no
     synchronization between threads in the netserver side, so why
     need to call pthread_join? */
  if (detached) {
    ret_val = pthread_detach(test->tid);
    
    if (ret_val != 0) {
      fprintf(where,
	      "start_worker: pthread_detach failed with %d\n",ret_val);
      fflush(where);
    }
  }

#else /* USE_THREADS */

  /* if we are not using threads, we must be using processes. stick a
     fork in here and see if we are done :) the return value of fork
     must not be assigned to test->tid directly: when the test_t lives
     in shared memory the child's zero could overwrite the pid stored
     by the parent. instead each side stores the child's pid - the
     child via getpid() and the parent from fork's return value - so
     whichever store lands last the value is the same, and the parent
     has it in hand even if the test_t is not shared or the child has
     yet to run. raj 2/98 */

  /* anything still sitting in a stdio buffer would be inherited by
     the child and written a second time when it exits */
  fflush(NULL);

  tmp_pid = fork();
  
  if (tmp_pid == 0) {
    /* we are the child - run the function. upon return of the
       function, just call exit */
    test->tid = getpid();
    start_func(start_arg);
    exit(0);
  }
  else if (tmp_pid == -1) {
    perror("start_worker: fork failled");
    exit(-1);
  }

  /* we are the parent - remember who to wait for and return */
  test->tid = tmp_pid;

#endif /* USE_THREADS */
}

/* block until the worker recorded in test->tid has finished: a
   pthread_join() when USE_THREADS is defined, a waitpid() otherwise.
   the outcome is only examined when debugging is enabled; in the
   threaded build a failed join then terminates the program, in the
   process build a mismatch is merely reported. */
void
wait_for_worker(test_t *test)
{
#ifdef USE_THREADS
  int join_status;
#else /* USE_THREADS */
  pid_t reaped;
#endif /* USE_THREADS */

  if (debug) {
    fprintf(where,
	    "Waiting for thread %d to complete\n",test->thread_num);
    fflush(where);
  }

#ifdef USE_THREADS
  /* the value returned by the thread is of no interest */
  join_status = pthread_join(test->tid,NULL);

  /* without debugging nobody looks at the result */
  if (!debug) {
    return;
  }

  if (join_status != 0) {
    fprintf(where,
	    "Unable to join with thread %d - status %d\n",
	    test->thread_num,
	    join_status);
    fflush(where);
    exit(-1);
  }

  fprintf(where,
	  "Joined with thread %d\n",test->thread_num);
#else /* USE_THREADS */

  reaped = waitpid(test->tid,NULL,0);

  if (debug && (reaped != test->tid)) {
    fprintf(where,
	    "Could not join with thread %d (%d), waitpid returned %d errno %d\n",
	    test->thread_num,
	    test->tid,
	    reaped,
	    errno);
    fflush(where);
  }
#endif /* USE_THREADS */
}

/* wait, one after another and in list order, for every worker on the
   list that starts at test and is chained through the next pointers. */
void
wait_for_workers(test_t *test)
{
  test_t *current;

  if (debug) {
    fprintf(where,
	    "waiting for the worker threads to complete\n");
    fflush(where);
  }

  for (current = test; current != NULL; current = current->next) {
    wait_for_worker(current);
  }
}
/* I may want to expand the args at some point */
/* handler for the SIGUSR1 used by the multiple-process barrier. the
   arrival of the signal is all that matters, so apart from an
   optional debug message there is nothing for it to do. sig is not
   examined. */
void
barrier_catcher(int sig)
{
  if (!debug) {
    return;
  }

  fprintf(where,
	  "Pid %d caught a barrier signal!\n",getpid());
  fflush(where);
}

/* one-time library initialization:
     - send diagnostic output to stdout
     - (process build only) install barrier_catcher() for SIGUSR1 and
       ignore SIGCHLD
     - point every netperf_switch slot at unknown_function()
     - allocate netperf_barrier and netperf_stop_barrier
     - let each compiled-in test suite register its routines
   any failure terminates the program. */
void
netlib3_init(void)
{
  int i,j;

#ifndef USE_THREADS
  struct sigaction my_action;
#endif /* USE_THREADS */

  where = stdout;

#ifndef USE_THREADS
  if (debug) {
    fprintf(where,
	    "installing signal handler for SIGUSR1\n");
    fflush(where);
  }
  sigemptyset(&(my_action.sa_mask));
  my_action.sa_handler = barrier_catcher;
  my_action.sa_flags = 0;

  if (sigaction(SIGUSR1,
		&my_action,
		NULL) != 0) {
    fprintf(where,
	    "Could not install the barrier signal catcher: errno %d\n",
	    errno);
    exit(-1);
  }
  

  /* we want to ignore SIGCHLD. the mask and flags set above are
     reused, only the disposition changes */
  my_action.sa_handler = SIG_IGN;
  if (sigaction(SIGCHLD,
		&my_action,
		NULL) != 0) {
    fprintf(where,
	    "Could not ignore SIGCHLD: errno %d\n",
	    errno);
    exit(-1);
  }

#endif /* USE_THREADS */

  if (debug) {
    fprintf(where,"Initializing the netperf_switch\n");
    fflush(where);
  }

  /* until a suite claims it, every entry complains and exits */
  for (i = 0; i < TEST_LAST; i++)
    for (j = 0; j < NUM_FUNC; j++) 
      netperf_switch[i][j] = unknown_function;

  
#ifdef USE_THREADS
  /* barrier_init() looks at the valid field to tell a fresh barrier
     from one being re-armed, so the memory must not start out as
     garbage - hence calloc rather than malloc. (assumes
     BARRIER_VALID is not zero - TODO confirm against netlib3.h) */
  netperf_barrier = (barrier_t *)calloc(1,sizeof(barrier_t));
  if (netperf_barrier == NULL) {
    fprintf(where,"Could not allocate the netperf_barrier\n");
    exit(-1);
  }
#else /* USE_THREADS */
  netperf_barrier = allocate_shared_memory(sizeof(barrier_t),
					   "/tmp/netperf_barrier");
#endif /* USE_THREADS */

  if (debug) {
    fprintf(where,"netperf_barrier allocated at %p\n",
	    (void *)netperf_barrier);
    fflush(where);
  }

#ifdef USE_THREADS
  netperf_stop_barrier = (barrier_t *)calloc(1,sizeof(barrier_t));
  if (netperf_stop_barrier == NULL) {
    fprintf(where,"Could not allocate the netperf_stop_barrier\n");
    exit(-1);
  }
#else /* USE_THREADS */
  /* using the same file name again is fine - allocate_shared_memory
     unlinks the file before it returns */
  netperf_stop_barrier = allocate_shared_memory(sizeof(barrier_t),
						"/tmp/netperf_barrier");
#endif /* USE_THREADS */

  if (debug) {
    fprintf(where,
	    "netperf_stop_ barrier allocated at %p\n",
	    (void *)netperf_stop_barrier); 
    fprintf(where,
	    "requesting that the bsd tests init their switch elts\n");
    fflush(where);
  }

  /* here we have a bunch of #ifdef's to call the various test suites
     init functions - mostly, they just add their routines to the
     switch table  */

  nettest3_bsd_init();

#ifdef DO_DNS
  if (debug) {
    fprintf(where,
	    "requesting that the dns tests init their switch elts\n");
    fflush(where);
  }

  nettest3_dns_init();
#endif /* DO_DNS */   

#ifdef DO_FTP
  if (debug) {
    fprintf(where,
	    "requesting that the ftp tests init their switch elts\n");
    fflush(where);
  }

  nettest3_ftp_init();
#endif /* DO_FTP */   

}


/* load a test_t with the default settings: not linked to any other
   test, no thread number, no arguments, an empty remote host name,
   the default control port with no control socket, a ten second
   TCP_STREAM test, eight byte alignments with zero offsets, zero
   ring widths, and verbosity 1 with headers shown and 'm' as the
   format units. */
void
init_test_globals(test_t *test)
{
  int slot;

  /* list linkage and identity */
  test->next = NULL;
  test->thread_num = -1;

  /* no arguments have been handed over yet */
  test->argc = 0;
  slot = 0;
  while (slot < NETPERF_ARGS_MAX) {
    test->argv[slot] = NULL;
    slot++;
  }

  /* an all-NUL host name */
  slot = 0;
  while (slot < NETPERF_HOST_MAX) {
    test->remote_host[slot] = '\0';
    slot++;
  }

  /* control connection - not yet established */
  test->control_port = DEFAULT_CONTROL_PORT;
  test->control_sock = -1;

  /* timing */
  test->times_up = 0;
  test->test_length = 10;

  /* the default test type is TCP_STREAM */
  strcpy(test->test_name,"TCP_STREAM");

  /* buffer placement, local side first and then remote */
  test->local_send_align = 8;
  test->local_recv_align = 8;
  test->local_send_offset = 0;
  test->local_recv_offset = 0;
  test->remote_send_align = 8;
  test->remote_recv_align = 8;
  test->remote_send_offset = 0;
  test->remote_recv_offset = 0;

  /* buffer ring sizes */
  test->send_width = 0;
  test->recv_width = 0;

  /* output controls */
  test->verbosity = 1;
  test->print_headers = 1;
  test->format_units = 'm';

#ifdef notdef
  /* at first, I was trying to put an interesting pattern into these
     areas, but it was more trouble than it was worth, so just zero
     them out - that way, if any of the test-specific fields are
     pointers, they will be NULL pointers, and if they are strings,
     they will be very short strings initially. */
  for (slot = 0; slot < NETPERF_MAX_SETTINGS; slot++)
    test->test_specific_settings[slot] = 0;

  for (slot = 0; slot < NETPERF_MAX_RESULTS; slot++)
    test->test_specific_results[slot] = 0;
#endif
}


/* create a region of size bytes that stays shared with processes
   forked later on. the region is backed by a file which is created
   (it must not already exist), grown to size, mapped MAP_SHARED and
   then unlinked, so nothing is left behind once the mapping goes
   away.

   size - number of bytes wanted
   file - path of the temporary backing file

   returns the base address of the mapping. any failure is reported
   on the diagnostic stream and terminates the program, taking the
   backing file with it. */
void *
allocate_shared_memory(uint32_t size, char *file)
{
  int fd;
  int saved_errno;

  void *temp_mmap_ptr;
  
  /* O_CREAT obliges us to pass the permission bits - leaving the
     third argument out makes open() pick up whatever garbage is
     lying around */
  fd = open(file,O_RDWR | O_CREAT | O_EXCL,0644);

  if (fd == -1) {
    fprintf(where,
	    "allocate_shared_memory: file creation; errno %d\n",errno);
    exit(-1);
  }
  
  /* the umask may have trimmed the mode given to open(), so set it
     explicitly. working through the descriptor rather than the path
     makes sure it is our file that gets changed */
  if (fchmod(fd,0644) == -1) {
    saved_errno = errno;
    (void) close(fd);
    (void) unlink(file);
    fprintf(where,
	    "allocate_shared_memory: chmod; errno %d\n",saved_errno);
    exit(-1);
  }
  
  /* with the file descriptor in place, lets be sure that the file is */
  /* large enough. */
  
  if (ftruncate(fd,(off_t)size) == -1) {
    saved_errno = errno;
    (void) close(fd);
    (void) unlink(file);
    fprintf(where,
	    "allocate_shared_memory: truncate: errno %d\n",saved_errno);
    exit(-1);
  }

  /* the file should be large enough now, so we can mmap it */
  
  /* if the system does not have MAP_VARIABLE, just define it to */
  /* be zero. it is only used/needed on HP-UX (?) raj 4/95 */
#ifndef MAP_VARIABLE
#define MAP_VARIABLE 0x0000
#endif /* MAP_VARIABLE */
#ifndef MAP_FILE
#define MAP_FILE 0x0000
#endif /* MAP_FILE */
  /* older headers may not spell out mmap's failure value */
#ifndef MAP_FAILED
#define MAP_FAILED ((void *)-1)
#endif /* MAP_FAILED */
  temp_mmap_ptr = mmap((void *)NULL,
		       size,
		       PROT_READ | PROT_WRITE,
		       MAP_FILE | MAP_SHARED | MAP_VARIABLE,
		       fd,
		       0);
  if (temp_mmap_ptr == MAP_FAILED) {
    saved_errno = errno;
    (void) close(fd);
    (void) unlink(file);
    fprintf(where,
	    "allocate_shared_memory: mmap: errno %d fd %d size %u\n",
	    saved_errno,
	    fd,
	    (unsigned int)size);
    exit(-1);
  }


  /* no need to keep the file descriptor open */
  close(fd);

  /* this way, we can be pretty much assured that it will go away when
     we exit. of course, if we are doing that, then why not simply
     create an anonymous memory region? probably, but MAP_ANONYMOUS
     appears in the HP-UX 11.00 manpages as an extension, so best we
     not rely on it. raj 2/98 */ 
  unlink(file);

  if (debug) {
    fprintf(where,
	    "returning %p as the base of the memory region\n",
	    temp_mmap_ptr);
    fflush(where);
  }

  return(temp_mmap_ptr);
}

/* the pthread versions of the following barrier manipulation routines
  come from pages 121-123 of "Threadtime" by Scott Norton and Mark
  Dipasquale */

/* Mutex to protect barrier initialization */
#ifdef USE_THREADS
pthread_mutex_t barrier_init_mutex = PTHREAD_MUTEX_INITIALIZER;
#endif /* USE_THREADS */

/* Barrier initialization: prepare barrier b to release its waiters
   once val of them have arrived. a barrier whose valid field already
   holds BARRIER_VALID merely has its count replaced, provided nobody
   is waiting on it; anything else is set up from scratch.

   threaded build: returns 0 on success, EBUSY when asked to re-arm a
   barrier that has waiters, or the error number of the pthread call
   that failed.

   process build: returns 0; every failure is reported on the
   diagnostic stream and terminates the program. */
int 
barrier_init(barrier_t *b, int val)
{
#ifdef USE_THREADS
  int     ret_val;

  /* Only allow one barrier init at a time to remove races. strictly
     speaking, this is not needed in netperf as only one thread will
     ever call barrier_init */
  ret_val = pthread_mutex_lock(&barrier_init_mutex);
  if (ret_val != 0)
    return(ret_val);

  /* Reinitializing the barrier count value? */
  if (b->valid == BARRIER_VALID) {
    /* Acquire the mutex for the barrier */
    ret_val = pthread_mutex_lock(&b->mutex);
    if (ret_val != 0) {
      (void) pthread_mutex_unlock(&barrier_init_mutex);
      return(ret_val);
    }

    /* If the barrier is currently busy, return an error. */
    if (b->blocked_threads != 0) {
      (void) pthread_mutex_unlock(&b->mutex);
      (void) pthread_mutex_unlock(&barrier_init_mutex);
      return(EBUSY);
    }

    /* Reset the barrier count value and return. */
    b->barrier_val = val;
    ret_val = pthread_mutex_unlock(&b->mutex);
    if (ret_val != 0) {
      (void) pthread_mutex_unlock(&barrier_init_mutex);
      return(ret_val);
    }
  } else {
    /* Initializing a barrier from scratch. */
    ret_val = pthread_mutex_init(&b->mutex, NULL);
    if (ret_val != 0) {
      (void) pthread_mutex_unlock(&barrier_init_mutex);
      return(ret_val);
    }

    ret_val = pthread_cond_init(&b->cv, NULL);
    if (ret_val !=0) {
      /* do not leave a half-built barrier holding a live mutex */
      (void) pthread_mutex_destroy(&b->mutex);
      (void) pthread_mutex_unlock(&barrier_init_mutex);
      return(ret_val);
    }

    b->barrier_val = val;
    b->blocked_threads = 0;
    b->predicate = 0;
    b->valid = BARRIER_VALID;
  }

  /* Release the lock and return. */
  ret_val = pthread_mutex_unlock(&barrier_init_mutex);
  if (ret_val !=0) {
    return(ret_val);
  }
#else /* USE_THREADS */

  msemaphore *returned_sem;

  int     ret_val;
  int32_t i;

  /* we are basically making the assumption that only the "main"
     netperf thread of execution ever calls this routine, so there is
     no equivalent of the init mutex used by the threaded build */

  if (b->valid == BARRIER_VALID) {
    /* the very first bit of initialization has been done, we are just
       resetting the barrier count value */

    if (debug) {
      fprintf(where,"locking2 the semaphore at %p\n",
	      (void *)&(b->semaphore));
      fflush(where);
    }

    /* grab the semaphore */
    ret_val = msem_lock(&(b->semaphore),0);
    if (ret_val != 0) {
      fprintf(where,
	      "Locking the barrier semaphore failled, errno %d\n",
	      errno);
      fflush(where);
      exit(-1);
    }

    /* there really should be no one here */
    if (b->blocked_threads != 0) {
      fprintf(where,
	      "Someone asked to init a busy barrier!");
      exit(-1);
    }

    /* ok, things are safe */
    b->barrier_val = val;

    ret_val = msem_unlock(&(b->semaphore),0);
    if (ret_val != 0) {
      fprintf(where,
	      "Unlocking the barrier semaphore failled: errno %d\n",
	      errno);
      fflush(where);
      exit(-1);
    }
  }
  else {
    /* we are starting from scratch */

    /* initialize our memory semaphore in the shared memory structure */
    returned_sem = msem_init(&(b->semaphore),
			     MSEM_UNLOCKED);

    if (debug) {
      fprintf(where,"our msem is at %p\n",(void *)returned_sem);
      fflush(where);
    }

    /* on success msem_init hands back the address it was given. this
       used to be an assert() containing an assignment rather than a
       comparison, which could never fire and which disappears
       altogether when compiled with NDEBUG */
    if (returned_sem != &(b->semaphore)) {
      fprintf(where,
	      "Initializing the barrier semaphore failed: errno %d\n",
	      errno);
      fflush(where);
      exit(-1);
    }

    /* no one should be waiting on the barrier */
    b->blocked_threads = 0;
    b->predicate = 0;
    b->barrier_val = val;
    b->valid = BARRIER_VALID;

    /* what *is* an apropriate intial value for pid_t's? somehow, I
       suspect that setting them to 0 is not such a hot idea? */
    for (i = 0; i < NETPERF_MAX_THREADS; i++) {
      b->waiting_pids[i] = (pid_t) -1;
    }
  }

#endif /* USE_THREADS */

  return(0);

}

/* Wait on a barrier: block the caller until barrier_val callers have
   arrived at barrier b, then let all of them go. the barrier re-arms
   itself for its next use.

   threaded build: returns 0 on success, EINVAL if b has not been
   through barrier_init(), or the error number of the pthread call
   that failed.

   process build: returns 0; every failure is reported on the
   diagnostic stream and terminates the program. */
int
barrier_wait(barrier_t *b)
{

#ifdef USE_THREADS
  int        ret_val,predicate;

  /* Is this a valid barrier ?*/
  if (b->valid != BARRIER_VALID)
    return(EINVAL);

  /* Acquire the mutex for the barrier and the condition variable. */
  ret_val = pthread_mutex_lock(&b->mutex);
  if (ret_val != 0)
    return(ret_val);

  /* Save away our predicate value for this wait operation */
  predicate = b->predicate;

  /* Increment the blocked counter and perform barrier operation. */
  b->blocked_threads++;
  if (b->blocked_threads == b->barrier_val) {
    /* Reset the barrier for its next use */
    b->predicate += 1;
    b->blocked_threads = 0;

    /* Last thread: wake-up all blocked threads. */
    ret_val = pthread_cond_broadcast(&b->cv);
    if (ret_val != 0) {
      (void) pthread_mutex_unlock(&b->mutex);
      return(ret_val);
    }
  } else {
    /* Wait until all threads have reached this point. the predicate
       only changes when the last thread arrives, so a wakeup that
       finds it unchanged was spurious and we go back to waiting */
    while (b->predicate == predicate) {
      ret_val = pthread_cond_wait(&b->cv, &b->mutex);
      if ((ret_val != 0) && (ret_val != EINTR)) {
	(void) pthread_mutex_unlock(&b->mutex);
	return(ret_val);
      }
    }
  }

  /* Release the mutex for the barrier and condition variable. */
  ret_val = pthread_mutex_unlock(&b->mutex);
  if (ret_val != 0)
    return(ret_val);
#else /* USE_THREADS */

  /* the basic idea here is to block a particular signal and grab the
     semaphore for the barrier, and increment the count. if we have
     not reached the desired number of waiters, we register our pid
     in the pid list, unlock the semaphore and go into a
     sigsuspend(). if we have hit the desired number of waiters, we
     walk the pid list sending the signal to all the pids stored
     therein. I am hoping that this will sufficiently mimic the
     behaviour of the barrier as described in Threadtime. raj 2/98 */

  int ret_val;
  int i;
  int predicate;
  sigset_t orig_mask;
  sigset_t new_mask;

  if (debug > 1) {
    fprintf(where,
	    "for pid %d in barrier_wait our outer semaphore is at %p\n",
	    getpid(),
	    (void *)&(b->semaphore));
  }

  /* ok, block the signal. with SIGUSR1 blocked, a wakeup sent before
     we get as far as sigsuspend() stays pending instead of being
     delivered too early and lost */
  sigemptyset(&new_mask);
  sigaddset(&new_mask,SIGUSR1);
  if (sigprocmask(SIG_BLOCK,
		  &new_mask,
		  &orig_mask) != 0) {
    fprintf(where,
	    "could not block SIGUSR1: errno %d\n",
	    errno);
    fflush(where);
    exit(-1);
  }

  if (debug) {
    fprintf(where,"locking a semaphore at %p\n",(void *)&(b->semaphore));
    fflush(where);
  }

  /* now, lock the outer semaphore */
  
  ret_val = msem_lock(&(b->semaphore),0);
  if (ret_val != 0) {
    fprintf(where,
	    "Locking the barrier semaphore failled, errno %d\n",
	    errno);
    fflush(where);
    exit(-1);
  }

  b->blocked_threads++;

  if (b->blocked_threads == b->barrier_val) {

    if (debug) {
      fprintf(where,
	      "blocked %d is val %d\n",
	      b->blocked_threads,
	      b->barrier_val);
      fflush(where);
    }

    /* time to wake everyone up. the predicate is bumped before any
       signal is sent so that a waiter who checks it after being
       signalled is sure to see the new value */
    b->predicate += 1;
    b->blocked_threads = 0;

    /* everyone who arrived before us has left their pid in the list */
    for (i = 0; i < b->barrier_val - 1; i++) {
      assert(b->waiting_pids[i] != (pid_t) -1);

      if (debug) {
	fprintf(where,"Sending SIGUSR1 to %d\n",b->waiting_pids[i]);
	fflush(where);
      }

      kill(b->waiting_pids[i], SIGUSR1);
      b->waiting_pids[i] = (pid_t) -1;
    }

    ret_val = msem_unlock(&(b->semaphore),0);
    if (ret_val != 0) {
      fprintf(where,
	      "Unlocking the barrier semaphore failled, errno %d\n",
	      errno);
      fflush(where);
      exit(-1);
    }

  }
  else {
    /* we need to wait for the signal */
    b->waiting_pids[b->blocked_threads-1] = getpid();

    /* remember which "generation" of the barrier we are waiting on */
    predicate = b->predicate;
    /* unlock the semaphore before we go to "sleep" */
    ret_val = msem_unlock(&(b->semaphore),0);
    if (ret_val != 0) {
      fprintf(where,
	      "Unlocking the barrier semaphore failled, errno %d\n",
	      errno);
      exit(-1);
    }

    if (debug) {
      fprintf(where,
	      "Pid %d waiting for a signal\n",
	      getpid());      
      fflush(where);
    }

    /* now wait for that SIGUSR1 to be raised */

    /* the set handed to sigsuspend() is the set of signals that stay
       *blocked* for the duration of the wait - hence everything
       except SIGUSR1. a SIGUSR1 which did not come from the last
       arrival at the barrier leaves the predicate untouched, in which
       case we simply wait some more rather than leave early */
    sigfillset(&new_mask);
    sigdelset(&new_mask, SIGUSR1);
    do {
      sigsuspend(&new_mask);
    } while (b->predicate == predicate);

    if (debug) {
      fprintf(where,
	     "Pid %d got a signal\n",
	     getpid());
      fflush(where);
    }

  }
  /* at this point, we should have received the SIGUSR1 or been the
     guy who brought us to the count. put the signal mask back to
     what it was before we entered this routine */

  if (sigprocmask(SIG_SETMASK,
		  &orig_mask,
		  NULL) != 0) {
    fprintf(where,
	    "could not restore the signal mask: errno %d\n",
	    errno);
    exit(-1);
  }
#endif /* USE_THREADS */
  return(0);
}

 /* this routine will convert the string into an unsigned integer. it */
 /* is used primarily for the command-line options taking a number */
 /* (such as the socket size) which could be rather large. If someone */
 /* enters 32M, then the number will be converted to 32 * 1024 * 1024. */
 /* If they enter 32m, the number will be converted to 32 * 1000 * */
 /* 1000. The letters recognized are K, M and G for powers of 1024 */
 /* and k, m and g for powers of 1000. A letter counts wherever it */
 /* appears in the string, and should several different letters */
 /* appear then each of their multipliers is applied. */
 /* */
 /* string - the text to convert; a NULL pointer yields 0 */
 /* */
 /* returns the leading decimal number times the multiplier(s). */
 /* arithmetic is unsigned, so a result too large for an unsigned */
 /* int wraps around rather than being reported. */
unsigned int
convert(char *string)
{
  static const struct {
    char         suffix;
    unsigned int multiplier;
  } scale[] = {
    { 'K', 1024U },
    { 'M', 1024U * 1024U },
    { 'G', 1024U * 1024U * 1024U },
    { 'k', 1000U },
    { 'm', 1000U * 1000U },
    { 'g', 1000U * 1000U * 1000U }
  };

  unsigned int base;
  size_t       i;

  if (string == NULL) {
    return(0);
  }

  /* unlike atoi(), strtoul() has defined behaviour for values which
     do not fit in an int */
  base = (unsigned int)strtoul(string, NULL, 10);

  for (i = 0; i < sizeof(scale) / sizeof(scale[0]); i++) {
    if (strchr(string, scale[i].suffix) != NULL) {
      base *= scale[i].multiplier;
    }
  }

  return(base);
}


/* Split an option argument of the form "first,second" in two. */
/* arg1 receives whatever precedes the first comma in s and arg2 */
/* whatever follows it. When s holds no comma at all, both arg1 and */
/* arg2 receive a copy of the whole of s. Note that s itself is */
/* altered: the comma, if there is one, is overwritten with a NUL. */
/* The caller must supply arg1 and arg2 buffers that are large */
/* enough; no lengths are checked here. */

/* copy the NUL terminated string at from, terminator included, to to */
static void
break_args_copy(to, from)
char	*to, *from;

{
  for (;;) {
    *to = *from;
    if (*from == '\0') {
      break;
    }
    to++;
    from++;
  }
}

void
break_args(s, arg1, arg2)
char	*s, *arg1, *arg2;

{
  char *comma = strchr(s,',');

  if (comma == NULL) {
    /* a lone value serves as both arguments */
    break_args_copy(arg2, s);
  }
  else {
    /* cut s short at the comma; the remainder is the second value */
    *comma = '\0';
    break_args_copy(arg2, comma + 1);
  }

  /* by now s ends where the first value ends */
  break_args_copy(arg1, s);
}

/* build a control message - a fixed size text header followed by up
   to len bytes of message - and send all of it on control_sock.

   control_sock - the connected control socket
   test_num     - test number placed in the header
   len          - bytes of payload the message is declared to carry
   message      - NUL terminated payload text; when it is shorter
                  than len the rest of the payload goes out as NULs

   a payload too long for the message buffer, or a failure to send,
   is reported and terminates the program. */
void
send_control_message(const int control_sock, 
		     const int32_t test_num,
		     const size_t len, 
		     const char *message) 
{
  char buffer[MAX_CONTROL_MSG_LEN];
  char *print_ptr;
  char *send_ptr;

  int32_t length;
  int32_t bytes_left;
  int32_t bytes_sent;

  int32_t display_len;


  if (debug) {
    fprintf(where,"send called with %s\n",message);
    fflush(where);
  }

  /* since the maximum message length in bytes is 4096, we know that
     length will never exceed a %4d. in fact, we want all the header
     fields to be fixed length in the message so we know ahead of time
     the "sizeof" the header as a string, which will be of great help
     in the recv_response and recv_request routines. at the moment,
     the control message header will be 20 bytes - four four character
     ints with a space after them*/

  if (len >= MAX_CONTROL_MSG_LEN - CONTROL_HEADER_SIZE) {
    printf("message too long!\n");
    exit(-1);
  }

  length = len + CONTROL_HEADER_SIZE;

  /* length bytes go out on the wire no matter how short the message
     string turns out to be, so make sure that whatever follows it is
     zeros and not leftovers from our stack */
  memset(buffer,0,sizeof(buffer));

  /* that trailing space is important */
  sprintf(buffer,
	  CONTROL_HEADER_FORMAT,
	  length,
	  NETPERF_VERSION,
	  NETPERF_UPDATE,
	  NETPERF_FIX,
	  test_num);

  /* the length check above guarantees there is room for this */
  strncat(buffer,message,len);

  /* send() is free to take only part of what it is offered, so keep
     at it until the whole message has gone */
  send_ptr = buffer;
  bytes_left = length;
  while (bytes_left > 0) {
    bytes_sent = send(control_sock,
		      send_ptr,
		      bytes_left,
		      0);
    if (bytes_sent < 0) {
      if (errno == EINTR) {
	/* a signal got in the way, nothing was sent - try again */
	continue;
      }
      fprintf(where,
	      "send_control_message: send failed on socket %d errno %d\n",
	      control_sock,
	      errno);
      fflush(where);
      exit(-1);
    }
    bytes_left -= bytes_sent;
    send_ptr += bytes_sent;
  }

  if (debug > 1) {
    /* first display the header */
    print_ptr = buffer;
    printf("Dump of sent control message:\n");
    printf("Header      (send)  :%.20s:\n",print_ptr);
    print_ptr += 20;
    display_len = len;
    /* and then the payload, forty characters to the line */
    while (display_len > 0) {
      printf("Data        (send)  :%.*s:\n",
	     MIN(display_len,40),
	     print_ptr);
      /* the check in the while should keep us from going past the end
	 of the message */
      display_len -= 40;
      print_ptr += 40;
    }
  }
}

/* tell the other end of the control connection that we have nothing
   more to send by shutting down the sending side of sock. if the
   shutdown fails, the problem is reported and the program exits. */
void
shutdown_control(int sock)
{
  int status;

  if (debug) {
    fprintf(where,
	    "shutdown_control: shutdown of control connection requested.\n");
    fflush(where);
  }

  /* a how of 1 means that no further sends will be made */
  status = shutdown(sock,1);
  if (status != -1) {
    return;
  }

  fprintf(where,
	  "shutdown_control: error in shutdown on socket %d. errno %d\n",
	  sock,
	  errno);
  exit(1);
}

/* read exactly count bytes from sock into dst, going back for more
   as often as it takes. returns 1 once everything has arrived and 0
   if the remote closed the connection first; a receive error is
   reported and terminates the program. */
static int
recv_control_bytes(int sock, char *dst, int32_t count)
{
  int32_t bytes_recvd;

  while (count > 0) {
    bytes_recvd = recv(sock,
		       dst,
		       count,
		       0);
    if (bytes_recvd < 0) {
      if (errno == EINTR) {
	/* a signal got in the way, nothing was lost - try again */
	continue;
      }
      fprintf(where,
	      "Unexpected byte count on control message of %d errno %d\n",
	      bytes_recvd,
	      errno);
      exit(-1);
    }
    if (bytes_recvd == 0) {
      return(0);
    }
    /* we got some data, decrement the bytes remaining and bump our
       pointer */
    count -= bytes_recvd;
    dst += bytes_recvd;
  }
  return(1);
}

/* receive one control message from control_sock.

   control_sock - the connected control socket
   wait         - zero: give up and exit if nothing shows up within
                  sixty seconds; non-zero: block for as long as it
                  takes
   len          - size of the buffer at message
   message      - receives the payload. all len bytes are zeroed
                  first, so a payload shorter than len ends up NUL
                  terminated

   returns the test number carried in the header, or SHUTDOWN if the
   remote closed the connection, in which case our sending side is
   shut down as well. a timeout, a receive error, a header that
   cannot be parsed or a payload that would not fit in len bytes is
   reported and terminates the program. a version, update or fix
   level which differs from ours is reported but otherwise tolerated. */
int32_t
recv_control_message(int control_sock, 
		     int wait,
		     size_t len, 
		     char *message)

{

  int32_t bytes_left;

  uint32_t message_len = 0;
  uint32_t version = 0;
  uint32_t update = 0;
  int32_t  fix = 0;
  int32_t  test_num = 0;

  int32_t counter;
  int     fields;

  fd_set readfds;
  struct timeval timeout;

  /* one extra byte so the header can be handed to sscanf as a
     properly terminated string */
  char   control_header[CONTROL_HEADER_SIZE + 1];
  char   *print_ptr;

  int32_t display_len;

  /* this is a bit brute-force, but it will ensure that subsequent
     calls to recv_control_message with the same buffer will not get
     muddled results */
  memset(message,0,len);

  if (!wait) {
    /* we will not wait indefinitely as we are expecting a timely
     response we only select on the control socket once - it is
     assumed that if the message is split in multiple parts, that all
     of it will arrive. this is not strictly true and so is probably
     not such a good idea, but I am cutting and pasting from netperf2
     at the moment so won't worry about it today. raj 1/98 */
  
    FD_ZERO(&readfds);
    FD_SET(control_sock,&readfds);
    timeout.tv_sec  = 60; /* wait one minute then punt */
    timeout.tv_usec = 0;
    
    /* select had better return one, or there was either a problem or
       a timeout... */
    
    if ((counter = select(FD_SETSIZE,
			  &readfds,
			  0,
			  0,
			  &timeout)) != 1) {
      fprintf(where,
	      "netperf3: control_sock error or timeout. errno %d counter %d\n",
	      errno,
	      counter);
      exit(-1);
    }
  }

  /* first, read-in the control header */
  if (!recv_control_bytes(control_sock,
			  control_header,
			  CONTROL_HEADER_SIZE)) {
    goto remote_close;
  }
  control_header[CONTROL_HEADER_SIZE] = '\0';

  /* at this point, we should have the header, so now we want to
     receive the rest of the message. first, we get some data from the
     control header */

  fields = sscanf(control_header,
		  CONTROL_HEADER_FORMAT,
		  &message_len,
		  &version,
		  &update,
		  &fix,
		  &test_num);

  /* all five header fields are needed, we cannot carry on with
     values that were never filled in */
  if (fields != 5) {
    fprintf(where,
	    "recv_control_message: malformed control header (%d fields)\n",
	    fields);
    exit(-1);
  }

  if (version != NETPERF_VERSION) {
    fprintf(where,"netperf3 version level mismatch!\n");
  }
  else if (update != NETPERF_UPDATE) {
    fprintf(where,"netperf3 update level mismatch!\n");
  }
  else if (fix != NETPERF_FIX) {
    fprintf(where,"netperf3 fix level mismatch!\n");
  }

  bytes_left = message_len - CONTROL_HEADER_SIZE;

  /* we don't want anyone overflowing a buffer, that could do nasty
     things to the health, well-being, and security of this program,
     and the system on which it is running. a total length smaller
     than the header itself is every bit as bogus as one that is too
     large */
  if ((bytes_left < 0) || ((size_t)bytes_left > len)) {
    fprintf(where,
	    "Someone tried to send us more message than we were expecting\n"); 
    exit(-1);
  }

  /* what the payload dump below will have to show */
  display_len = bytes_left;

  if (!recv_control_bytes(control_sock,
			  message,
			  bytes_left)) {
    goto remote_close;
  }

  if (debug > 1) {
    /* first display the header */
    printf("Dump of received control message:\n");
    /* shame one cannot embed a macro in the strings... */
    printf(CONTROL_HEADER_FMT,control_header);
    /* and then the payload, forty characters to the line */
    print_ptr = message;
    while (display_len > 0) {
      printf("Data                :%.*s:\n",
	     MIN(display_len,40),
	     print_ptr);
      /* the check in the while should keep us from going past the end
	 of the message */
      display_len -= 40;
      print_ptr += 40;
    }
  }
  return(test_num);

remote_close:
  if (debug) {
    fprintf(where,
	    "recv_control_message: remote reqeusted shutdown of control\n");
  }

  shutdown_control(control_sock);
  return(SHUTDOWN);
}


/*
  establish_control()

set-up the control connection between me and the server so we can
actually run some tests. if we cannot establish the control
connection, we might as well punt...  

the variables for the control socket are kept in this lib so as to
'hide' them from the upper routines as much as possible so we can
change them without affecting anyone... 

this code comes (with slight modification) from version 2.1 of
netperf. it probably should be made IPv6 aware at some point */


int
establish_control(char hostname[],uint16_t port)
{

  struct	sockaddr_in	server;         /* remote host address */
  struct	servent		*sp;            /* server entity */
  struct	hostent		*hp;            /* host entity */

  unsigned int addr;

  int control_sock;
  char control_message[4096];  /* more with the magic numbers */

  if (debug > 1) {
    printf("establish_control: entered with %s and %d\n",
	   hostname,
	   port);
  }

  /********************************************************/
  /* Set up the control socket control_sock first	*/
  /* for the time being we will assume that all set-ups	*/
  /* are for tcp/ip and sockets...			*/
  /********************************************************/
  
  bzero((char *)&server,
	sizeof(server));
  server.sin_port = htons(port);

  /* it would seem that while HP-UX will allow an IP address (as a */
  /* string) in a call to gethostbyname, other, less enlightened */
  /* systems do not. fix from awjacks@ca.sandia.gov raj 10/95 */  
  /* order changed to check for IP address first. raj 7/96 */

  if ((addr = inet_addr(hostname)) == -1) {
    /* it was not an IP address, try it as a name */
    if ((hp = gethostbyname(hostname)) == NULL) {
      /* we have no idea what it is */
      printf("establish_control: could not resolve the destination %s\n",
	     hostname);
      exit(1);
    }
    else {
      /* it was a valid hostname */
      bcopy(hp->h_addr,
	    (char *)&server.sin_addr,
	    hp->h_length);
      server.sin_family = hp->h_addrtype;
    }
  }
  else {
    /* it was a valid IP address */
    server.sin_addr.s_addr = addr;
    server.sin_family = AF_INET;
  }    

  if (debug > 1) {
    printf("resolved the destination...now creating the socket \n");
  }
  
  control_sock = socket(server.sin_family,
			SOCK_STREAM,
			IPPROTO_TCP);
  
  if (control_sock < 0){
    perror("establish_control: control socket");
    exit(-1);
  }
  
  if (debug > 1) {
    printf("about to connect\n");
  }
  
  if (connect(control_sock, 
	      (struct sockaddr *)&server, 
	      sizeof(server)) < 0) {
    perror("establish_control: control socket connect failed");
    fprintf(where,
	    "Are you sure there is a netserver running on %s at port %d?\n",
	    hostname,
	    port);
    fflush(where);
    exit(1);
  }
  if (debug) {
    fprintf(where,"establish_control: connect completes\n");
  }
  
  /* The Control Socket set-up is done, so now we want to test for
     connectivity on the connection - the normal send and recv message
     code will examine the version information as a matter of course,
     so we can just send a simple message and look at the reply. at
     some point, we may make that a bit more sophisticated. raj 1/98 */

  sprintf(control_message,"%-60s","Netperf Version Check");

  if (debug) {
    fprintf(where,
	    "calling send with %s",control_message);
    fflush(where);
  }

  send_control_message(control_sock,
		       VERSION,
		       strlen(control_message),
		       control_message);

  recv_control_message(control_sock,
		       0,
		       60,
		       control_message);

  return(control_sock);
}

/* calibrate_local_cpu()

   as it stands this routine does nothing: the entire body sits
   inside "#ifdef notdef", so unless notdef is defined the function
   is empty and the test argument is unused.

   the disabled code is kept as a sketch of what calibration is meant
   to do: count the CPUs, optionally start the looper processes, and
   then either take the user supplied local_cpu_rate or measure a
   maximum rate with calibrate_looper()/calibrate_pstat().

   NOTE(review): the disabled code finishes with
   "return lib_local_maxrate;" while the function is declared void,
   so it needs rework before it can be switched back on. */
void
calibrate_local_cpu(test_t *test)
{
#ifdef notdef  
  lib_num_loc_cpus = get_num_cpus();

  /* start out assuming no idle-loop based measurement */
  lib_use_idle = 0;
#ifdef USE_LOOPER
  /* we want to get the looper processes going */
  start_looper_processes();
  lib_use_idle = 1;
#endif /* USE_LOOPER */

  if (local_cpu_rate > 0) {
    /* The user thinks they know what the cpu rate is. We assume */
    /* that all the processors of an MP system are essentially the */
    /* same - for this reason we do not have a per processor maxrate. */
    /* if the machine has processors which are different in */
    /* performance, the CPU utilization will be skewed. raj 4/95 */
    lib_local_maxrate = local_cpu_rate;
  }
  else {
    /* if neither USE_LOOPER nor USE_PSTAT are defined, we return a */
    /* 0.0 to indicate that times or getrusage should be used. raj */
    /* 4/95 */
    lib_local_maxrate = (float)0.0;
#ifdef USE_LOOPER    
    lib_local_maxrate = calibrate_looper(4,10);
#endif
#ifdef USE_PSTAT
#ifdef PSTAT_IPCINFO
    /* one version of pstat needs calibration */
    lib_local_maxrate = calibrate_pstat(4,10);
#endif /* PSTAT_IPCINFO */
#endif /* USE_PSTAT */
  }
  /* NOTE(review): a value is returned from a void function here */
  return lib_local_maxrate;
#endif /* notdef */
}

/* cpu_start()

   mark the beginning of a timed interval for the given test.

   the start time is always stored in test->time1 (with the timezone
   going to test->tz). if test->local_cpu_usage is set, a starting CPU
   sample is taken as well, using whichever method was compiled in:

     USE_LOOPER               copy the value behind each
                              lib_idle_address[i] to lib_start_count[i]
     USE_PSTAT+PSTAT_IPCINFO  read the per-processor idle cycle
                              counters with pstat_getprocessor()
     USE_PSTAT alone          read the system-wide cpu times with
                              pstat_getdynamic() into cp_time1[]

   with none of these defined only the timestamp is taken.

   unlike cpu_stop(), a failing pstat_getprocessor() is not fatal
   here; lib_start_count[] is simply left as it was. */
void
cpu_start(test_t *test)
{

  int	i;

  gettimeofday(&(test->time1),
	       &(test->tz));

  if (test->local_cpu_usage) {
#ifdef USE_LOOPER
    cpu_method = LOOPER;
    for (i = 0; i < lib_num_loc_cpus; i++){
      lib_start_count[i] = *lib_idle_address[i];
    }
#else
#ifdef	USE_PSTAT
    cpu_method = PSTAT;
#ifdef PSTAT_IPCINFO
    /* we need to know if we have the 10.0 pstat interface */
    /* available. I know that at 10.0, the define for PSTAT_IPCINFO */
    /* was added, but that it is not there prior. so, this should */
    /* act as the automagic compile trigger that I need. raj 4/95 */
    cpu_method = HP_IDLE_COUNTER;
    {
      /* get the idle cycle counter for each processor */
      struct pst_processor *psp;
      union overlay_u {
	long long full;
	long      word[2];
      } *overlay;

      psp = (struct pst_processor *)malloc(lib_num_loc_cpus * sizeof(*psp));

      /* do not hand a failed allocation to pstat_getprocessor() */
      if ((psp != NULL) &&
	  (pstat_getprocessor(psp, sizeof(*psp), lib_num_loc_cpus, 0) != -1)) {
	for (i = 0; i < lib_num_loc_cpus; i++) {
	  /* the counter arrives as two halves; store them into the
	     two words overlaying lib_start_count[i] */
	  overlay = (union overlay_u *)&(lib_start_count[i]);
	  overlay->word[0] = psp[i].psp_idlecycles.psc_hi;
	  overlay->word[1] = psp[i].psp_idlecycles.psc_lo;
	  if(debug) {
	    fprintf(where,
		    "\tlib_start_count[%d] = 0x%8.8x%8.8x\n",
		    i,
		    hi_32(&lib_start_count[i]),
		    lo_32(&lib_start_count[i]));
	    fflush(where);
	  }
	}
      }

      /* release the scratch buffer whether or not the call worked -
	 it used to be leaked when pstat_getprocessor() failed.
	 free(NULL) is harmless. */
      free(psp);
    }
#else
    /* this is what we should get when compiling on an HP-UX 9.X */
    /* system. raj 4/95 */
    pstat_getdynamic((struct pst_dynamic *)&pst_dynamic_info,
		     sizeof(pst_dynamic_info),1,0);
    for (i = 0; i < PST_MAX_CPUSTATES; i++) {
      cp_time1[i] = pst_dynamic_info.psd_cpu_time[i];
    }
#endif /* PSTAT_IPCINFO */
#endif /* USE_PSTAT */
#endif /* USE_LOOPER */
  }
}


/* cpu_stop()

   mark the end of a timed interval for the given test; the
   counterpart of cpu_start().

   if test->local_cpu_usage is set, an ending CPU sample is taken
   with whichever method was compiled in:

     USE_LOOPER               copy the value behind each
                              lib_idle_address[i] to lib_end_count[i],
                              send SIGTERM to every lib_idle_pids[i],
                              reap whatever has already exited, then
                              unmap the shared region and unlink
                              /tmp/netperf_cpu
     USE_PSTAT+PSTAT_IPCINFO  read the per-processor idle cycle
                              counters into lib_end_count[]; a failing
                              pstat_getprocessor() is fatal here
     USE_PSTAT alone          read pstat_getdynamic() cpu times into
                              cp_time2[]
     WIN32+NT_SDK             query the NT system time and processor
                              performance information

   afterwards the end time is stored in test->time2 and the interval
   time2 - time1, in seconds, is stored in test->elapsed_time. */
void
cpu_stop(test_t *test)
{
  /* NOTE(review): this header is pulled in from inside the function
     body. at the top of the file it is only included when
     USE_THREADS is not defined, so this is presumably here to cover
     waitpid()/WNOHANG for the threaded build - confirm before
     moving it. */
#ifndef WIN32
#include <sys/wait.h>
#endif /* WIN32 */

  int	sec,
        usec;

  int	i;

  if (test->local_cpu_usage) {
#ifdef USE_LOOPER
    /* grab the ending counts before the children are killed off */
    for (i = 0; i < lib_num_loc_cpus; i++){
      lib_end_count[i] = *lib_idle_address[i];
    }
    /* now go through and kill-off all the child processes */
    for (i = 0; i < lib_num_loc_cpus; i++){
      /* SIGKILL can leave core files behind - thanks to Steinar Haug */
      /* for pointing that out. */
      kill(lib_idle_pids[i],SIGTERM);
    }
    /* reap the children. WNOHANG makes this non-blocking, so only
       children that have already exited are collected here */
    while(waitpid(-1, NULL, WNOHANG) > 0) { }
    
    /* finally, unmap the shared region and unlink the file behind it */
    munmap((caddr_t)lib_base_pointer,
	   ((NETPERF_PAGE_SIZE * PAGES_PER_CHILD) * 
	    lib_num_loc_cpus));
    unlink("/tmp/netperf_cpu");
#else
#ifdef	USE_PSTAT
#ifdef PSTAT_IPCINFO
    {
      struct pst_processor *psp;
      union overlay_u {
	long long full;
	long      word[2];
      } *overlay;
      /* NOTE(review): the malloc result is not checked before use */
      psp = (struct pst_processor *)malloc(lib_num_loc_cpus * sizeof(*psp));
      if (pstat_getprocessor(psp, sizeof(*psp), lib_num_loc_cpus, 0) != -1) {
	for (i = 0; i < lib_num_loc_cpus; i++) {
	  /* the counter arrives as two halves; store them into the
	     two words overlaying lib_end_count[i] */
	  overlay = (union overlay_u *)&(lib_end_count[i]);
	  overlay->word[0] = psp[i].psp_idlecycles.psc_hi;
	  overlay->word[1] = psp[i].psp_idlecycles.psc_lo;
	  if(debug) {
	    fprintf(where,
		    "\tlib_end_count[%d]   = 0x%8.8x%8.8x\n",
		    i,
		    hi_32(&lib_end_count[i]),
		    lo_32(&lib_end_count[i]));
	    fflush(where);
	  }
	}
	free(psp);
      }
      else {
	/* without an ending sample there is nothing to report, so give up */
	fprintf(where,"pstat_getprocessor failure: errno %d\n",errno);
	fflush(where);
	exit(1);
      }
    }
#else /* not HP-UX 10.0 or later */
    {
      pstat_getdynamic(&pst_dynamic_info, sizeof(pst_dynamic_info),1,0);
      for (i = 0; i < PST_MAX_CPUSTATES; i++) {
	cp_time2[i] = pst_dynamic_info.psd_cpu_time[i];
      }
    }    
#endif /* PSTAT_IPCINFO */
#else
#ifdef WIN32
#ifdef NT_SDK
    NTSTATUS status;                                         /* robin */
    /* robin */
    status = NtQuerySystemTime( &systime_end );              /* robin */
    if (debug) {                                             /* robin */
      if (!NT_SUCCESS(status)) {                            /* robin */
	fprintf(where,"NtQuerySystemTime "                 /* robin */
		"failed: 0x%08X\n",                  /* robin */
		status);                             /* robin */
      }                                                     /* robin */
    }                                                        /* robin */
    status = NtQuerySystemInformation (                      /* robin */
				       SystemProcessorPerformanceInformation,       /* robin */
				       &sysperf_end,                                /* robin */
				       sizeof(sysperf_end),                         /* robin */
				       NULL );                                      /* robin */
    /* robin */
    if (debug) {                                             /* robin */
      if (!NT_SUCCESS(status)) {                            /* robin */
	fprintf(where,"NtQuerySystemInformation "          /* robin */
		"failed: 0x%08X\n",                  /* robin */
		status);                             /* robin */
      }                                                     /* robin */
    }                                                        /* robin */
    /* robin */
#endif /* NT_SDK */
#endif /* WIN32 */
#endif /* USE_PSTAT */
#endif /* USE_LOOPER */
  }
  
  /* take the ending timestamp after any CPU sampling above */
  gettimeofday(&(test->time2),
	       &(test->tz));
  
  /* borrow a second so the usec subtraction below cannot go
     negative. note that this rewrites the stored test->time2 in
     place rather than working on a copy. */
  if (test->time2.tv_usec < test->time1.tv_usec) {
    test->time2.tv_usec	+= 1000000;
    test->time2.tv_sec	-= 1;
  }
  
  sec	= test->time2.tv_sec - test->time1.tv_sec;
  usec	= test->time2.tv_usec - test->time1.tv_usec;
  /* elapsed time in seconds, with the microseconds as the fraction */
  test->elapsed_time	= (float)sec + ((float)usec/(float)1000000.0);

}

/* sub_add_timeval()

   accumulate the interval (time2 - time1) into *timesum.

   time1 and time2 are passed by value, so the caller's copies are
   left alone. the tv_usec fields of time1, time2 and *timesum are
   expected to be in the range 0..999999 on entry; given that, a
   single borrow and a single carry are enough to leave
   timesum->tv_usec in that range on return. */
void
sub_add_timeval(struct timeval time1,
		struct timeval time2,
		struct timeval *timesum)
{

  /* turn our private copy of time2 into the raw field-by-field
     difference */
  time2.tv_sec  -= time1.tv_sec;
  time2.tv_usec -= time1.tv_usec;

  /* the second timestamp had fewer microseconds than the first, so
     borrow one second to bring the usec part back to non-negative */
  if (time2.tv_usec < 0) {
    time2.tv_sec  -= 1;
    time2.tv_usec += 1000000;
  }

  /* fold the interval into the running total */
  timesum->tv_sec  += time2.tv_sec;
  timesum->tv_usec += time2.tv_usec;

  /* carry a whole second out of the microseconds if they spilled */
  if (timesum->tv_usec >= 1000000) {
    timesum->tv_sec  += 1;
    timesum->tv_usec -= 1000000;
  }

}

/* calc_thruput()

   convert a raw count into a rate expressed in the units selected by
   test->format_units.

     test            supplies format_units (the unit selector) and
                     elapsed_time (the length of the interval, as
                     stored by cpu_stop())
     units_received  the raw count for the interval. the /8 in the
                     'k', 'm' and 'g' divisors suggests a byte count
                     that is being reported in bits - TODO confirm
                     against the callers

   format_units   divisor
     'K'          1024
     'M'          1024^2
     'G'          1024^3
     'k'          1000 / 8
     'm'          1000^2 / 8
     'g'          1000^3 / 8
     't'          1 (transactions per second for the _RR tests)
     other        1024, the same as 'K'

   returns units_received / divisor / elapsed_time. elapsed_time is
   not checked for zero. */
double
calc_thruput(test_t *test, double units_received)

{
  double	divisor;
  double        result;

  /* We will calculate the thruput in libfmt units/second */
  switch (test->format_units) {
  case 'K':
    divisor = 1024.0;
    break;
  case 'M':
    divisor = 1024.0 * 1024.0;
    break;
  case 'G':
    divisor = 1024.0 * 1024.0 * 1024.0;
    break;
  case 'k':
    divisor = 1000.0 / 8.0;
    break;
  case 'm':
    divisor = 1000.0 * 1000.0 / 8.0;
    break;
  case 'g':
    divisor = 1000.0 * 1000.0 * 1000.0 / 8.0;
    break;
  case 't':
    /* this is for a _RR test where the units are trans/s. the break
       matters: without it control ran on into the default case and
       the transaction rate came out divided by 1024. */
    divisor = 1.0;
    break;
  default:
    /* anything we do not recognize is treated like 'K' */
    divisor = 1024.0;
    break;
  }

  if (debug) {
    fprintf(where,
	    "divisor %f elapsed %f format %c units %f\n",
	    divisor,
	    test->elapsed_time,
	    test->format_units,
	    units_received);
    fflush(where);
  }

  result = units_received / divisor / test->elapsed_time;

  if (debug) {
    fprintf(where,"result %f\n",result);
    fflush(where);
  }

  return (result);

}


