#include <stdio.h>
#include <getopt.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <pcre.h>
#include <unistd.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>

// Maximum number of bytes between the start and end REs (-m/--max). main() also
// uses it as the subject length when searching for the end RE.
unsigned int max=100000;
// Number of bytes dropped from the front of the buffer after a region has been
// dumped without an end match (-u/--unmatched), so the same start is not found again.
unsigned int unmatched=50;
// Size in bytes of the read buffer; main() sets it to max+unmatched+overlap+1.
unsigned int max_buffer=0;
// Set by -r/--re-size. NOTE(review): no other use of this value is visible in this file.
unsigned int max_re_size=50;	//Max length of RE to match (must be smaller than half of maximum buffer length).
// Extra bytes added to the buffer size computed in main().
const int overlap=10;
// Read buffer, allocated in main().
char *buffer=NULL;
// Incremented by write_to_file() each time a file has been written.
int file_number=0;
// Input file name given with -f/--file; when NULL, main() reads file descriptor 0.
char *filename=NULL;
// Non-zero once -q/--quiet was given; write_to_file() then returns without writing.
char quiet=0;

/*
 * One extraction job: a start RE, an optional end RE, and the prefix and
 * extension used to name the files written for this job. Jobs form a singly
 * linked list headed by `jobs`.
 *
 * NOTE(review): the RE members are declared `struct pcre *` while
 * pcre_compile_string() returns `pcre *`; confirm against pcre.h whether
 * these are the same type.
 */
struct re_job {
  struct pcre *start_re;
  struct pcre *end_re;
  struct re_job *next;
  char *prefix;
  char *postfix;
} *jobs = NULL;

/* The job most recently created by new_re_job() or selected by find_start_re(). */
struct re_job *current_job;

/*
 * Allocates a job with default settings (no REs, empty prefix, "dump"
 * extension), links it after current_job when there is one, and makes it the
 * new current_job. The list head `jobs` is not touched; the caller stores the
 * first job there.
 *
 * Returns the new job. Prints a message and exits if memory cannot be
 * allocated.
 */
struct re_job *new_re_job(void)
{
  /* The allocation must not live inside assert(): with NDEBUG defined the
     whole expression, malloc() call included, would be compiled out. */
  struct re_job *temp = malloc(sizeof *temp);

  if(!temp) {
    perror("malloc");
    exit(-1);
  }

  temp->start_re=NULL;
  temp->end_re=NULL;
  temp->next=NULL;
  temp->prefix="";
  temp->postfix="dump";

  if(current_job)
    current_job->next=temp;
  current_job=temp;
  return (temp);
}

/***********************************
 * Compiles a PCRE given in buffer, interpreting options just like perl,
 * e.g. "/foo.*bar/is". The first char of buffer is taken as the delimiter;
 * everything after the closing delimiter is read as option letters
 * (i m s x A E U X 8; spaces and newlines are ignored).
 *
 * buffer itself is left untouched: the pattern is cut out of a private copy.
 *
 * Returns the compiled RE. On any error (empty argument, missing closing
 * delimiter, unknown option, out of memory, pattern that does not compile)
 * a message is printed and the program exits.
 ***********************************/
pcre *pcre_compile_string(char *buffer)
{
	int options = 0;
	char delimiter;
	char *pp;
	char *re;
	pcre *temp;

	const char *error;
	int error_offset;

	//An empty argument has no delimiter; looking at buffer[1] would read
	//past the terminator.
	if (buffer[0] == 0) {
		printf("Empty RE: expected a delimited pattern such as /foo/i\n");
		exit(-1);
	}

	delimiter = buffer[0];
	pp = buffer + 1;

	re = strdup(buffer + 1);
	if (!re) {
		perror("strdup");
		exit(-1);
	}

	//Advance pp to the closing delimiter, stepping over backslash escapes
	//so that an escaped delimiter does not end the pattern.
	while (*pp != 0) {
		if (*pp == '\\' && pp[1] != 0)
			pp++;
		else if (*pp == delimiter)
			break;
		pp++;
	}

	if (*pp == 0) {
		printf("No closing delimiter in %s\n", buffer);
		exit(-1);
	}

	//pp points at the closing delimiter: terminate the copy there and
	//move on to the option letters.
	re[pp - buffer - 1] = 0;
	pp++;

	while (*pp != 0) {
		switch (*pp++) {
		case 'i':
			options |= PCRE_CASELESS;
			break;
		case 'm':
			options |= PCRE_MULTILINE;
			break;
		case 's':
			options |= PCRE_DOTALL;
			break;
		case 'x':
			options |= PCRE_EXTENDED;
			break;

		case 'A':
			options |= PCRE_ANCHORED;
			break;
		case 'E':
			options |= PCRE_DOLLAR_ENDONLY;
			break;
		case 'U':
			options |= PCRE_UNGREEDY;
			break;
		case 'X':
			options |= PCRE_EXTRA;
			break;
		case '8':
			options |= PCRE_UTF8;
			break;

		case '\n':
		case ' ':
			break;
		default:
			printf("** Unknown option '%c'\n", pp[-1]);
			exit(-1);
		}
	}

	//All options are parsed - compile the RE.
	temp = pcre_compile(re, options, &error, &error_offset, NULL);

	if (!temp) {
		//Report the pattern before releasing it; it used to be read
		//after free(), and with its first character skipped.
		printf("Error in RE: %s at %d, %s\n", re, error_offset, error);
		free(re);
		exit(-1);
	}

	free(re);
	return (temp);
}

/*
//Add a start_re to the end of the re list.
void add_start_re(char *re)
{
  struct re_job *temp=jobs;
  while(temp->next) temp=temp->next;

  if(temp->start_re) {
    temp->next=new_re_job();
    temp=temp->next;
  };

  temp->start_re=pcre_compile_string(re);
};
*/

/*
void add_end_re(char *re)
{
  struct re_job *temp=jobs;
  while(temp->next) temp=temp->next;

  if(temp->end_re) {
    temp->next=new_re_job();
    temp=temp->next;
  };

  temp->end_re=pcre_compile_string(re);
};

*/

/* Search through all the re_jobs and return the first offset in the buffer which matches any of the re's. job will also return that structure that matched. */
int find_start_re(char *buffer,int length, struct re_job **cur) {
     int offsets[45];
     int size_offsets=15;

     int min_offset=0;
     int temp=0;
     struct re_job *i = jobs;

     while(i && i->start_re) {
       temp=pcre_exec(i->start_re,0,buffer,length,0,0,offsets,size_offsets);
       if(temp>0 && min_offset<offsets[0] )  {
	 min_offset=offsets[0];
	 *cur=i;
       };
       i = i->next;
     };
     
     return(min_offset);
};

/* license: print the program banner and the GPL notice to standard output */
void license(void)
{
	static const char notice[] =
		"exgrep - a regular expression forensic extraction tool\n"
		"Type exgrep -h for help.\n"
		"\n"
		"Copyright (C) 2003, Michael Cohen (scudette@reapoff.no-ip.com)\n"
		"\n"
		"This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version.\n"
		"\n"
		"This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.\n"
		"\n"
		"You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA\n";

	fputs(notice, stdout);
}

/*
 * help: print the usage summary to standard output.
 *
 * Every option accepted by the option parser in main() is listed, including
 * -u/--unmatched, which used to be accepted but undocumented.
 */
void help(void)
{
	static const char usage[] =
		"exgrep - a regular expression forensic extraction tool Version 0.3\n"
		"Usage: exgrep [options]\n"
		"\n"
		"-l,--license\t\tPrints out the License terms for this product\n"
		"-h,--help\t\tThis cruft\n"
		"-s,--start RE\t\tAn RE to mark the start of the required region (mand)\n"
		"-e,--end RE\t\tAn RE to mark the end of the required region (optional)\n"
		"-m,--max INT\t\tMaximum number of bytes between start and end REs\n"
		"-u,--unmatched INT\tBytes to skip past a start match when no end RE matches\n"
		"-x,--extension STR\tExtension to append to the end of each file\n"
		"-p,--prefix STR\t\tString to prepend to file names created\n"
		"-f,--file STR\t\tFilename to open (else reads from STDIN)\n"
		"-r,--re-size INT\tMaximum number bytes to be matched by start RE\n"
		"-q,--quiet\t\tDo not write any files, just print the offsets where start re was found\n";

	fputs(usage, stdout);
}

void write_to_file(char *buffer, int length, long long unsigned int offset,struct re_job *current_job)
{
  int out_fd;
  char *filename;

  //If we need to stay quiet, we get out of here...
  if(quiet) return;

  assert(filename=(char *) malloc((strlen(current_job->prefix)+50)*sizeof(char)));
  
  snprintf(filename,strlen(current_job->prefix)+49,"%s%016llu.%s",current_job->prefix,offset,current_job->postfix);
  out_fd=creat(filename,S_IRUSR| S_IWUSR);
  
  if(out_fd<0) {
    perror(filename);
    exit(-1);
  };
  
  if(write(out_fd,buffer,length)<length) {
    perror("write to file");
    exit(-1);
  };
  
  close(out_fd);
  free(filename);
  file_number++;
};

/* Largest value accepted for a numeric option and for the buffer size: every
   buffer length in this file is carried in an int. */
enum { EXGREP_COUNT_LIMIT = 0x7fffffff };

/* Parses the decimal argument of a numeric option; prints a message and exits
   on anything that is not a plain non-negative number within range. */
static unsigned int parse_count(const char *text, const char *option_name)
{
  char *end = NULL;
  unsigned long value;

  errno = 0;
  value = strtoul(text, &end, 10);

  /* strtoul() silently negates "-5", so a minus sign is rejected by hand. */
  if (end == text || *end != '\0' || errno == ERANGE ||
      strchr(text, '-') != NULL ||
      value > (unsigned long)EXGREP_COUNT_LIMIT) {
    fprintf(stderr, "Invalid value '%s' for option %s\n", text, option_name);
    exit(-1);
  }

  return (unsigned int)value;
}

/*
 * Entry point: parses the command line into the job list, then scans the
 * input (the file given with -f, else file descriptor 0) one buffer at a time.
 *
 * For every start RE match reported by find_start_re() the data is shifted so
 * the match sits at the front of the buffer, the buffer is topped up, and the
 * region is handed to write_to_file(): up to the end of the matching job's
 * end RE when that is found within the first `max` bytes, otherwise the whole
 * buffer.
 *
 * read_size is the stream offset of buffer[0].
 *
 * Returns 0 once the input is exhausted; exits with a message on bad options,
 * allocation failure, or a read/open error.
 */
int main(int argc, char *argv[])
{
  static struct option long_options[] = {
    {"license", 0, 0, 'l'},
    {"help", 0, 0, 'h'},
    {"quiet", 0, 0, 'q'},
    {"start", 1, 0, 's'},
    {"end", 1, 0, 'e'},
    {"max", 1, 0, 'm'},
    {"unmatched", 1, 0, 'u'},
    {"file", 1, 0, 'f'},
    {"extension", 1, 0, 'x'},
    {"prefix", 1, 0, 'p'},
    {"re-size", 1, 0, 'r'},
    {0, 0, 0, 0}
  };

  int c;
  int fd = 0;
  int length = 0;
  int offsets[45];
  /* Hand pcre_exec() the whole vector, not just its first third. */
  const int size_offsets = (int)(sizeof offsets / sizeof offsets[0]);
  unsigned long long buffer_bytes;

  //This is really huge 64bits to accommodate huge HDDs.
  long long unsigned int read_size = 0;

  jobs = new_re_job();
  current_job = jobs;

  //Parse all options
  while (1) {
    int option_index = 0;

    c = getopt_long(argc, argv,
                    "lhs:e:qm:u:f:x:p:r:",
                    long_options, &option_index);
    if (c == -1)
      break;

    switch (c) {
    case 'q':
      quiet = 1;
      break;
    case 'h':
      help();
      exit(0);
      break;
    case 'l':
      license();
      exit(0);
      break;
    case 's':
      //If the current job already has a start RE, this one opens a new job.
      if (current_job->start_re) new_re_job();
      current_job->start_re = pcre_compile_string(optarg);
      break;
    case 'e':
      //Likewise, a second end RE opens a new job.
      if (current_job->end_re) new_re_job();
      current_job->end_re = pcre_compile_string(optarg);
      break;
    case 'm':
      max = parse_count(optarg, "-m/--max");
      break;
    case 'u':
      unmatched = parse_count(optarg, "-u/--unmatched");
      break;
    case 'f':
      filename = optarg;
      break;
    case 'p':
      current_job->prefix = optarg;
      break;
    case 'x':
      current_job->postfix = optarg;
      break;
    case 'r':
      max_re_size = parse_count(optarg, "-r/--re-size");
      break;
    default:
      printf("Unknown option '%c'\n", c);
      exit(-1);
    }
  }

  if (optind < argc) {
    printf("non-option ARGV-elements: ");
    while (optind < argc)
      printf("%s ", argv[optind++]);
    printf("\n");
  }

  //Is this linux specific????
  //Only supply a fallback when the system headers did not define the flag.
#ifndef O_LARGEFILE
# define O_LARGEFILE    0100000
#endif

  /* If we have a file specified, just open it... */
  if (filename && (fd = open(filename, O_LARGEFILE | O_RDONLY)) < 0) {
    fprintf(stderr, "Cant open %s for reading\n", filename);
    exit(-1);
  }

  //Work the size out in 64 bits so that large -m/-u values cannot wrap.
  buffer_bytes = (unsigned long long)max + unmatched + overlap + 1;
  if (buffer_bytes > (unsigned long long)EXGREP_COUNT_LIMIT) {
    fprintf(stderr, "Buffer of %llu bytes is too large; lower -m or -u\n", buffer_bytes);
    exit(-1);
  }
  max_buffer = (unsigned int)buffer_bytes;

  //Allocate outside assert() so the call survives a build with NDEBUG.
  buffer = malloc(max_buffer);
  if (!buffer) {
    perror("malloc");
    exit(-1);
  }

  /* Fill the buffer with data from fd, then search it for a start sequence. */
  while ((length = read(fd, buffer, max_buffer)) > 0) {
    int temp = find_start_re(buffer, length, &current_job);

    //A positive offset means find_start_re() matched and pointed current_job
    //at the matching job. Shift the data so the start sequence lines up on
    //the beginning of the buffer, then read some more to fill it again.
    while (temp > 0) {
      int result;
      int search_length;

      read_size += temp;
      printf("Found start at %016llu\n", read_size);

      length -= temp;
      memmove(buffer, buffer + temp, length);
      result = read(fd, buffer + length, max_buffer - length);
      if (result > 0) {
        length += result;
      }

      //Look for the end sequence in at most `max` bytes, and never beyond
      //the data actually present in the buffer.
      search_length = length;
      if ((unsigned int)search_length > max)
        search_length = (int)max;

      //Use the end RE of the job whose start RE matched. A return of 0 is
      //still a match (only the capture vector was too small).
      if (current_job->end_re &&
          pcre_exec(current_job->end_re, 0, buffer, search_length, 0, 0, offsets, size_offsets) >= 0) {
        //The data is written by length, so it must not be NUL-terminated:
        //that would overwrite a byte that stays in the buffer.
        write_to_file(buffer, offsets[1], read_size, current_job);

        //Drop everything in front of the end sequence.
        length -= offsets[0];
        memmove(buffer, buffer + offsets[0], length);
        read_size += offsets[0];
      } else {
        int skip;

        //No end sequence in the current buffer, so give up and save the
        //entire buffer to disk.
        write_to_file(buffer, length, read_size, current_job);

        //Lose enough characters off the beginning to avoid the start
        //sequence matching again. Near the end of the input fewer than
        //`unmatched` bytes may be left, so never drop more than there are.
        skip = length;
        if ((unsigned int)skip > unmatched)
          skip = (int)unmatched;

        length -= skip;
        memmove(buffer, buffer + skip, length);
        read_size += skip;

        //Read again to refill the buffer
        result = read(fd, buffer + length, max_buffer - length);
        if (result > 0) {
          length += result;
        }
      }

      temp = find_start_re(buffer, length, &current_job);
    }

    //No (further) start sequence in this buffer: the remaining data is
    //discarded and the next read starts a fresh buffer. Nothing is carried
    //over, so a start sequence split across two buffers is not found.
    read_size += length;
  }

  if (length < 0) {
    perror("read");
    exit(-1);
  }

  free(buffer);
  buffer = NULL;
  if (fd != 0)
    close(fd);

  return (0);
}
