/* Copyright (c) 2004 Yutaka Sato. All rights reserved.
 *
 * any2fdif.c
 *
 * 040602 created as a HTML to MAIL converter
 * 040609 extended to a FDIF generator taking generic input types
 *
 */
#include <sys/types.h>
#include <sys/stat.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <ctype.h>
#include <time.h>
#include "vsocket.h"
#include "proc.h"
#include "yarg.h"
#include "file.h"
#include "ystring.h"
#include "credhy.h"
#include "dglib.h"

typedef struct sed_env SedEnv;
int sed_compile(SedEnv *se,PCStr(command));
void sed_execute1(SedEnv *se,PCStr(in),PVStr(out),int err);

int scanAttrs(PCStr(src),int an,PCStr(nam1),PVStr(val1),int vsiz1,PCStr(nam2),PVStr(val2),int vsiz2);
static void scanUrls(FILE *flist,int optr,PCStr(strip),PCStr(pre),PCStr(conv),FILE *out,FILE *descf);
void updateSrt(PCStr(base),PCStr(strip),PCStr(pre));
int extractAuthor(PCStr(str),PCStr(top),PVStr(author),int size,PCStr(url),int dump);
int copy2auth(PCStr(copyr),PVStr(author),int size,PCStr(url),int force);
int fscanTag(FILE *in,int ch,PVStr(tag),int tsiz);
int isRFC822(FILE *fp);
const char *strskip(PCStr(s),PCStr(p));
char *wbstrcpy(PVStr(dst),PCStr(dx),PCStr(src),int len);
int scanMeta(PCStr(src),PVStr(nam),int nsiz,PVStr(con),int csiz);
void backseek(FILE *in, int disp);

static double Start;

#ifdef MAIN
/* Stand-alone (MAIN) stub: no name is ever reported as a full URL. */
int isFullURL(PCStr(url))
{
	return 0;
}
/* Stand-alone (MAIN) stub: reports that fetching is unavailable, returns no stream. */
FILE *URLget(PCStr(url),int reload,FILE *out)
{
	fputs("URLget() is not available.\n",stderr);
	return NULL;
}
/* Stand-alone (MAIN) stub: no version string is available. */
const char *DELEGATE_verdate()
{
	return NULL;
}

/* Stand-alone (MAIN) stub: every service maps to port 0. */
int serviceport(PCStr(service))
{
	return 0;
}
/* Stand-alone (MAIN) stub: yields empty proto, login and path; returns 0. */
int decomp_absurl(PCStr(url),PVStr(proto),PVStr(login),PVStr(upath),int ulen)
{
	*upath = 0;
	*login = 0;
	*proto = 0;
	return 0;
}
/* Stand-alone (MAIN) stub: yields empty proto, host and port; returns 0. */
int scan_protositeport(PCStr(url),PVStr(proto),PVStr(userpasshost),PVStr(port))
{
	*port = 0;
	*userpasshost = 0;
	*proto = 0;
	return 0;
}
/* Stand-alone (MAIN) stub: never finds a tag/attribute. */
char *html_nextTagAttrX(void *vBase,PCStr(html),PCStr(ctype),PVStr(rem),const char **tagp,const char **attrp,int *convmaskp)
{
	return NULL;
}
/* Stand-alone (MAIN) stub: there is no "last fetched URL". */
const char *getURLgetURL()
{
	return NULL;
}

int any2fdif_main(int ac,const char *av[]);
/* Stand-alone entry point: hands the command line to any2fdif_main(). */
int main(int ac,const char *av[])
{	int rcode;

	rcode = any2fdif_main(ac,av);
	return rcode;
}
#endif

void fflushTmp(FILE *fp);
FILE *getTmp(int ti);
#define TMP0	0
#define TMP1	1
#define TMP2	2
#define TMP3	3
#define TMP4	4
#define TMP5	5

static const char *ccx_outcode = "a-b-r-EUC-JP";
static const char *hide_addr = "From:%l@%r..%c";
static const char *indexbase = "freyasx/bank";

#define T_TEXT	1
#define T_MAIL	2
#define T_HTML	3
#define T_HTTP	4
#define T_JS	5
#define T_MBOX	6

#define CH_COPYR	0xA9
static char thru8[32] = {CH_COPYR};
static int Itype = T_HTML;

/*
 * Run-time state of any2fdif.  A single instance is allocated in
 * any2fdif_main() and reached through the AFenv pointer; the macros
 * defined after this struct give each member a short global-looking name.
 */
typedef struct {
	int	opt_v;		/* set by -v */
	int	opt_q;		/* set by -q */
	int	opt_d;
	int	opt_u;		/* set by -u (update the atime sort file) */
	int	opt_a;		/* set by -a (do not skip non-indexable suffixes) */
	int	printIfText;	/* set per URL in scanUrls() when only -a let it through */
	int	NumAny;		/* incremented for each name read in scanUrls() */
	int	NumUrl;
	int	NumPut;		/* reported as "Indexed" and compared with MaxPut */
	int	MaxPut;		/* set by -N; scanUrls() exits when NumPut reaches it */
	int	Codes[1000];	/* per-code counters dumped by dumptypes() */
	int	withAuthor;
	int	guessedAuthor;
	void	*UrlSed;	/* compiled -e expression, applied in makeUrl() */
  const	char	*UrlMount[2]; /**/ /* the two patterns given with -m */

	int	Fsize;		/* size of the file currently being converted */
	int	Mtime;		/* its modification time */
	int	Atime;		/* its access time */

	FILE	*Out;		/* "<name>.fdif" or stdout */
	FILE	*Descf;		/* "<name>.desc" */
	FILE	*Summf;		/* "<name>.summ" */

	MStr(	e_Url,1024);
	MStr(	e_Descr,1024);
	MStr(	e_Keywd,1024);
	MStr(	e_title,1024);
	MStr(	e_Heads,1024);
	MStr(	e_author,1024);
	MStr(	e_Address,1024);
	MStr(	e_Links,2048);
	MStr(	e_Location,1024);
	MStr(	e_Xuri,1024);
	MStr(	e_XRefs,1024);
	int	Ccx[64];
	int	CCXdisable;

	MStr(	e_baseDir,256);	/* filled by getbase() for each name read from the list */
} AFEnv;

static AFEnv *AFenv;
#define opt_v		AFenv->opt_v
#define opt_q		AFenv->opt_q
#define opt_d		AFenv->opt_d
#define opt_u		AFenv->opt_u
#define opt_a		AFenv->opt_a
#define printIfText	AFenv->printIfText
#define NumAny		AFenv->NumAny
#define NumUrl		AFenv->NumUrl
#define NumPut		AFenv->NumPut
#define MaxPut		AFenv->MaxPut
#define Codes		AFenv->Codes
#define withAuthor	AFenv->withAuthor
#define guessedAuthor	AFenv->guessedAuthor
#define UrlSed		AFenv->UrlSed
#define UrlMount	AFenv->UrlMount
#define Fsize		AFenv->Fsize
#define Mtime		AFenv->Mtime
#define Atime		AFenv->Atime
#define Out		AFenv->Out
#define Descf		AFenv->Descf
#define Summf		AFenv->Summf
#define Url		AFenv->e_Url
#define Descr		AFenv->e_Descr
#define Keywd		AFenv->e_Keywd
#define Title		AFenv->e_title
/**/
#define Heads		AFenv->e_Heads
#define Author		AFenv->e_author
/**/
#define Address		AFenv->e_Address
#define Links		AFenv->e_Links
#define Location	AFenv->e_Location
#define XUri		AFenv->e_Xuri
/**/
#define XRefs		AFenv->e_XRefs
#define Ccx		AFenv->Ccx
#define CCXdisable	AFenv->CCXdisable
#define baseDir		AFenv->e_baseDir

#define MAX_TYPES	128
/*
 * screening by suffix of URL
 * this table should be loadable from any2fdif.conf
 */
/*
 * Suffix table.  Entry 0 (empty suffix) is the fallback for unknown
 * suffixes; the list ends at the first entry whose ext is NULL.
 * Entries given without an itype have itype 0, which callers treat as
 * "do not index" unless -a is given.
 */
static struct {
  const	char	*ext;	/* suffix, compared case-insensitively */
	int	 itype;	/* T_* input type returned by doindextype() */
  const	char	*cnv;	/* per-suffix converter command set by setConv() */
	int	 cnt;	/* number of names classified into this entry */
	int	 ixs[8];	/* counters indexed by the E_* result codes */
} types[MAX_TYPES] = {
	{"",      T_HTTP}, /* undefined extensions */
	{"/",     T_HTML},
	{"/MAIL", T_MAIL},
	{"htm",   T_HTML},
	{"html",  T_HTML},
	{"shtml", T_HTML},
	{"txt",   T_TEXT},
	{"cgi",   T_HTTP},
	{"asp",   T_HTTP},
	{"jsp",   T_HTTP},
	{"php",   T_HTTP},
	{"php3",  T_HTTP},
	{"fcg",   T_HTTP},
	{"exe",   T_HTTP},
	{"pl",    T_HTTP},
	{"c",     T_TEXT},
	{"cc",    T_TEXT},
	{"h",     T_TEXT},
	{"o"},
	{"a"},
	{"js"},

	{"gif"},
	{"jpg"},
	{"swf"},
	{"css"},
	{"png"},
	{"ico"},
	{"jpeg"},
	{"pdf"},

	{"bmp"},
	{"asx"},
	{"curl"},
	{"dmg"},
	{"mov"},
	{"mid"},
	{"mpg"},
	{"pjpeg"},
	{"ram"},
	{"ras"},
	{"rm"},
	{"sit"},
	{"wav"},
	{"wmv"},
	{"xbm"},
	{"xpi"},
	
	{"bin"},
	{"bz2"},
	{"gz"},
	{"hqx"},
	{"ipk"},
	{"lzh"},
	{"rpm"},
	{"tar"},
	{"tgz"},
	{"zip"},
	
	{"doc"},
	{"ppt"},
	{"ps"},
	{"xml"},
	0
};
/*
 * Register the preprocess command `cnv' for names with suffix `ext'.
 * An existing entry gets its command replaced; otherwise a new T_TEXT
 * entry is appended.  `ext' is stored by reference (not copied), `cnv'
 * is duplicated.
 *
 * The table is terminated by an entry whose ext is NULL, so a new entry
 * may only be appended while at least one more zeroed slot remains after
 * it to act as the terminator.
 */
static void setConv(PCStr(ext),PCStr(cnv))
{	int xi;
	const char *cext;

	for( xi = 0; cext = types[xi].ext; xi++ ){
		if( strcaseeq(ext,cext) ){
			types[xi].cnv = strdup(cnv);
			return;
		}
	}
	/* xi is the terminator slot; xi+1 must stay inside the array */
	if( xi+1 < MAX_TYPES ){
		types[xi].ext = ext;
		types[xi].itype = T_TEXT;
		types[xi].cnv = strdup(cnv);
	}else{
		fprintf(stderr,"Error: too many suffix types, ignored: %s\n",ext);
	}
}
/*
 * Look up the converter command registered for the suffix of `path'
 * (the text after its last '.').  Returns 0 when the path has no '.',
 * when the suffix is not in the table, or when the matching entry has
 * no command.
 */
static const char *getConv(PCStr(path))
{	const char *dot;
	const char *suffix;
	int ti;

	dot = strrchr(path,'.');
	if( dot == 0 )
		return 0;
	suffix = dot + 1;

	ti = 0;
	while( types[ti].ext != 0 ){
		if( strcaseeq(suffix,types[ti].ext) )
			return types[ti].cnv;
		ti++;
	}
	return 0;
}

/*
 * Classify `url' by its suffix.
 * Stores the index of the matching `types' entry into *ixp, counts the
 * hit in that entry and returns its itype (0 for suffixes that are
 * listed without an input type).  Index 0 is used when nothing matches.
 *
 * NOTE: although declared const, the string is modified in place while
 * it is examined (a trailing "/@" or "/=" is cut after the '/', a
 * query part is cut at '?') and restored before returning, so `url'
 * must point to writable memory.
 */
static int doindextype(PCStr(url),int *ixp)
{	int xi;
	const char *ext;
	const char *cext;
	const char *dp;
	const char *qp;
	const char *cp;
	char cch;

	/* temporarily turn a trailing "/@" or "/=" into "/" */
	if( cp = strtailstr(url,"/@") ){
		cch = cp[1];
		((char*)cp)[1] = 0;
	}else
	if( cp = strtailstr(url,"/=") ){
		cch = cp[1];
		((char*)cp)[1] = 0;
	}
	/* temporarily hide the query part */
	if( qp = strchr(url,'?') )
		truncVStr(qp);

	/* ext: "/" for a directory-like name, otherwise the text after the
	 * last '.' of the last path component, or "" when it has no '.';
	 * left 0 when the name contains no '/' at all */
	ext = 0;
	if( strtailchr(url) == '/' )
		ext = "/"; 
	else
	if( dp = strrchr(url,'/') ){
		if( dp = strrchr(dp+1,'.') )
			ext = dp + 1;
		else	ext = "";
	}

	if( ext == 0 ){
		xi = 0;
		goto EXIT;
	}
	/* the empty-suffix entry (index 0) is never matched here; it is
	 * only the fallback when the loop finds nothing */
	for( xi = 0; cext = types[xi].ext; xi++ ){
		if( *cext && strcaseeq(ext,cext) )
			goto EXIT;
	}
	xi = 0;
EXIT:
/*
if( xi == 0 && *ext != 0 ) fprintf(stderr,"+++ [%s] %s\n",ext,url);
*/
	/* undo the temporary truncations */
	if( cp ) ((char*)cp)[1] = cch; /**/
	if( qp ) *(char*)qp = '?'; /**/
	types[xi].cnt += 1;
	*ixp = xi;
	return types[xi].itype;
}

#define E_OK		0
#define E_EOF		1
#define E_CTRL		2
#define E_ENCODED	3
#define E_NONTEXT	4
#define E_BINARY	5
#define E_EMPTY		6
#define E_DIR		7

/*
 * Print the statistics to stderr: one line per suffix that was seen
 * (count, share of all names, and the per-result counters for suffixes
 * that are indexable or when -a is set), then the table of response
 * codes if any 200 or 302 was counted, then the number of indexed
 * documents.
 */
static void dumptypes(){
	int ti;
	int code;
	int hits;
	int total = 0;

	for( ti = 0; types[ti].ext != 0; ti++ )
		total += types[ti].cnt;

 fprintf(stderr,"suffix  count  ratio indexed ctrl ntxt encd  bin null  dir\n");
 fprintf(stderr,"------ ------ ------ ------- ---- ---- ---- ---- ---- ----\n");
	for( ti = 0; types[ti].ext != 0; ti++ ){
		hits = types[ti].cnt;
		if( hits == 0 )
			continue;

		fprintf(stderr,"%-6s %6d %5.1f%%",
			types[ti].ext,hits,(hits*100)/(float)total);
		if( opt_a || types[ti].itype ){
			const int *rs = types[ti].ixs;
			/* empty input and immediate EOF share one column */
			fprintf(stderr," %7d %4d %4d %4d %4d %4d %4d",
				rs[E_OK],
				rs[E_CTRL],
				rs[E_NONTEXT],
				rs[E_ENCODED],
				rs[E_BINARY],
				rs[E_EMPTY]+rs[E_EOF],
				rs[E_DIR]);
		}
		fprintf(stderr,"\n");
	}

	if( Codes[200] || Codes[302] ){
		fprintf(stderr,"\n");
		fprintf(stderr,"  code  count\n");
		fprintf(stderr,"------ ------\n");
		for( code = 0; code < 1000; code++ ){
			if( Codes[code] == 0 )
				continue;
			fprintf(stderr,"%6d %6d\n",code,Codes[code]);
		}
		fprintf(stderr,"\n");
	}

	fprintf(stderr,"Indexed: %d (with Author: %d+%d)\n",NumPut,
		withAuthor,guessedAuthor);
}

static void Lap(int force,int outlen,PCStr(fmt),...);

/* Context handed through Scandir() to scanfile() while walking a directory */
typedef struct {
	FILE   *flist;	/* receives one found path per line */
	int	flen;	/* number of paths written to flist */
	int	optr;	/* the remaining members are copies of scandir()'s arguments */
  const	char   *strip;
  const	char   *pre;
  const	char   *conv;
	FILE   *out;
	FILE   *descf;
} Opts;
static const char *optx; /* URLs to be excluded */
static const char *optX; /* symbolic link or link to another site */

/*
 * Scandir() callback: called with one entry `file' of directory `dir'.
 * Sub-directories are walked recursively with the depth budget `optr'
 * reduced by one; any other entry is appended as "dir/file" to
 * opts->flist.  Returns -1 when the depth budget is exhausted,
 * otherwise 0.
 */
static scanDirFunc scanfile(PCStr(file),PCStr(dir),Opts *opts,int optr)
{	CStr(path,1024);

	if( optr < 0 )
		return -1;
	if( strcmp(file,".") == 0 )
		return 0;
	if( strcmp(file,"..") == 0 )
		return 0;

	sprintf(path,"%s/%s",dir,file);
	if( fileIsdir(path) ){
		Scandir(path,scanDirCall scanfile,path,opts,optr-1);
		return 0;
	}

	fprintf(opts->flist,"%s\n",path);
	opts->flen += 1;
	/* progress report for every 100 files */
	if( (opts->flen % 100) == 0 )
		Lap(0,0,"%d files found\n",opts->flen);
	return 0;
}
/*
 * Walk the directory `url' (up to `optr' levels), collecting the paths
 * of the files found into a temporary file, then feed that list to
 * scanUrls() with the depth budget reduced by one.
 * Does nothing when `optr' is not positive.  If the temporary file
 * cannot be created the directory is skipped with an error message
 * instead of writing through a NULL stream.
 */
static void scandir(PCStr(url),int optr,PCStr(strip),PCStr(pre),PCStr(conv),FILE *out,FILE *descf)
{	Opts opts;

	if( optr <= 0 ){
		return;
	}
	Lap(1,0,"scanning directory [%s]\n",url);
	opts.flist = tmpfile();
	if( opts.flist == NULL ){
		fprintf(stderr,"Error: can't create a temporary file list for [%s]\n",url);
		return;
	}
	opts.flen = 0;
	opts.optr = optr;
	opts.strip = strip;
	opts.pre = pre;
	opts.conv = conv;
	opts.out = out;
	opts.descf = descf;

	Scandir(url,scanDirCall scanfile,url,&opts,optr);
	Lap(1,0,"%d files under [%s]\n",opts.flen,url);
	Start = Time();

	/* rewind the collected list and process it as a list of names */
	fflush(opts.flist);
	fseek(opts.flist,0,0);
	scanUrls(opts.flist,optr-1,strip,pre,conv,out,descf);
	fclose(opts.flist);
}

/* One entry of the list of URLs found by scanlinks() */
typedef struct {
	char   *u_url;	/* heap copy of the URL */
	int	u_crc;	/* CRC32 of u_url, compared before the string itself */
	int	u_done;	/* non-zero once handed out by getnextlink() (or added as done) */
} UrlScan;
static UrlScan *Urls;
static int Urlx;
static int Urli;
/*
 * Add `url' to the list of URLs to be visited unless it is already
 * there.  URLs whose suffix is not indexable are ignored unless -a is
 * set.  `done' is the initial value of the visited flag (non-zero for
 * a URL that is being processed right now).
 *
 * The list grows by doubling, starting from 1024 entries.  The element
 * size is that of UrlScan; the name `Url' is a macro for a text buffer
 * in AFenv and must not be used for sizing.  On allocation failure the
 * URL is dropped with a message and the existing list stays intact.
 */
static void addUrl(PCStr(base),PCStr(url),int done)
{	int i,crc;
	int ix;
	int nsize;
	UrlScan *nurls;
	char *copy;

	if( !doindextype(url,&ix) ){
		if( opt_a == 0 )
			return;
	}
	crc = strCRC32(url,strlen(url));
	for( i = 0; i < Urli; i++ ){
		if( Urls[i].u_crc == crc )
		if( strcmp(Urls[i].u_url,url) == 0 ){
			return;
		}
	}
	if( Urlx <= Urli ){
		nsize = (Urlx == 0) ? 1024 : Urlx * 2;
		/* realloc(NULL,...) acts as malloc() for the first allocation */
		nurls = (UrlScan*)realloc(Urls,nsize*sizeof(UrlScan));
		if( nurls == NULL ){
			fprintf(stderr,"Error: no memory for URL list (%d URLs)\n",Urli);
			return;
		}
		Urls = nurls;
		Urlx = nsize;
	}
	copy = strdup(url);
	if( copy == NULL ){
		fprintf(stderr,"Error: no memory for URL: %s\n",url);
		return;
	}
	Urls[Urli].u_url = copy;
	Urls[Urli].u_crc = crc;
	Urls[Urli].u_done = done;
	Urli++;
}
/*
 * Hand out the first URL of the list that has not been visited yet:
 * marks it as done, copies it into `url' (at most `size') and returns
 * `url'.  Returns NULL when no unvisited URL is left.
 */
char *getnextlink(PVStr(url),int size)
{	UrlScan *up;
	int ui;

	for( ui = 0; ui < Urli; ui++ ){
		up = &Urls[ui];
		if( up->u_done != 0 )
			continue;
		up->u_done = 1;
		QStrncpy(url,up->u_url,size);
		return (char*)url;
	}
	return NULL;
}
const char *getURLgetURL();
/*
 * Rewrite `url' in place into the form "proto://login/path":
 *  - a port in the login part equal to serviceport(proto) is removed,
 *  - a "#fragment" is removed from the path,
 *  - the path is passed through chdir_cwd() (presumably to resolve
 *    "." and ".." -- confirm against chdir_cwd()),
 *  - a trailing '/' of the original URL is preserved,
 *  - '\' is replaced by '/' and runs of '/' (including leading ones)
 *    are collapsed.
 * `siz' is not used by this function.
 */
static void normalizeURL(PVStr(url),int siz)
{	CStr(proto,64);
	CStr(login,256);
	CStr(upath,1024);
	CStr(nupath,1024);
	const char *dp;

	decomp_absurl(url,AVStr(proto),AVStr(login),AVStr(upath),sizeof(upath));
	/* drop ":port" when it is the default port of the protocol */
	if( dp = strrchr(login,':') ){
		if( atoi(dp+1) == serviceport(proto) )
			truncVStr(dp);
	}
	if( dp = strchr(upath,'#') )
		truncVStr(dp);
	nupath[0] = 0;
	chdir_cwd(AVStr(nupath),upath,1);

	if( strtailchr(url) == '/' && strtailchr(nupath) != '/' )
		strcat(nupath,"/");

	/* this should be done in chdir_cwd() ... */ {
		refQStr(dp,upath); /**/
		const char *sp;
		char ch;
		char pch;
		dp = upath;
		/* starting with pch=='/' makes leading slashes be dropped too */
		pch = '/';
		for( sp = nupath; (ch = *sp); sp++ ){
			if( ch == '\\' )
				ch = '/';
			if( ch == '/' ){
				if( pch == '/' )
					continue;
			}
			setVStrPtrInc(dp,ch);
			pch = ch;
		}
		setVStrEnd(dp,0);
	}

	sprintf(url,"%s://%s/%s",proto,login,upath);
}
/*
 * Derive the "base directory" of the URL `base' into `basedir':
 * the first line of `base' is copied, and if it contains "//" the
 * query part is removed and the text after the last '/' of the path
 * is cut off, leaving "scheme://host/dir/".  Names without "//" are
 * copied unchanged.
 */
static void getbase(PCStr(base),PVStr(basedir),int size)
{	const char *dp;

	linescanX(base,AVStr(basedir),size);
	if( dp = strstr(basedir,"//") ){
		const char *tp;
		dp += 2;
		if( tp = strpbrk(dp,"?") )
			truncVStr(tp);
		/* step over the host part, then cut after the last '/' of the path */
		if( tp = strchr(dp,'/') )
			dp = tp + 1;
		if( tp = strrchr(dp,'/') )
			((char*)tp)[1] = 0;
	}
}

/*
 * Scan a stored HTTP response `in' (status line, header, body) that
 * was fetched for `abase' and queue the links found in it with addUrl().
 *
 * Nothing is queued unless the response starts with "HTTP/" and its
 * Content-Type begins with "text/html".  Only HREF attributes of <A...
 * tags and SRC attributes of <FRAME... tags are followed.  Links with
 * a query part and "javascript:" links are ignored; relative links are
 * completed with the server or directory of the base URL.  A link is
 * queued only if it contains the -X string when -X is given, otherwise
 * only if it lies under baseDir.  Always returns 0.
 */
static int scanlinks(FILE *in,int optr,PCStr(abase))
{	CStr(base,1024);
	CStr(line,1024);
	CStr(rem,1024);
	CStr(ctype,1024);
	const char *np;
	const char *tag;
	const char *attr;
	CStr(tagn,32);
	CStr(attrn,32);
	CStr(url,1024);
	CStr(proto,256);
	CStr(host,256);
	CStr(port,256);
	const char *dp;
	CStr(baseserv,1024);
	CStr(basedir,1024);
	int isAnchor;
	int isFrame;

	/* the URL actually fetched (after redirection) takes precedence */
	strcpy(base,abase);
	if( getURLgetURL() )
		strcpy(base,getURLgetURL());
	normalizeURL(AVStr(base),sizeof(base));
	addUrl(abase,base,1);

	if( fgets(line,sizeof(line),in) == NULL ){
		return 0;
	}
	if( strncmp(line,"HTTP/",5) != 0 ){
		return 0;
	}
	/* read the header, remembering the media type */
	ctype[0] = 0;
	for(;;){
		if( fgets(line,sizeof(line),in) == NULL )
			break;
		if( strncmp(line,"Content-Type:",13) == 0 ){
			dp = line+13;
			while(isspace((unsigned char)*dp))
				dp++;
			wordscanY(dp,AVStr(ctype),sizeof(ctype),"^; \t\r\n");
		}
		if( *line == '\r' || *line == '\n' )
			break;
	}
	if( strncmp(ctype,"text/html",9) != 0 ){
		return 0;
	}

	{
		scan_protositeport(base,AVStr(proto),AVStr(host),AVStr(port));
		sprintf(baseserv,"%s://%s",proto,host);
		if( *port ){
			Xsprintf(TVStr(baseserv),":%s",port);
		}
		getbase(base,AVStr(basedir),sizeof(basedir));
	}

	rem[0] = 0;
	for(;;){
		if( fgets(line,sizeof(line),in) == NULL ){
			break;
		}
		np = line;
		for(;;){
			tag = 0;
			attr = 0;
			np = html_nextTagAttrX(NULL,np,ctype,AVStr(rem),&tag,&attr,NULL);
			if( np == 0 )
				break;
			if( tag )
				wordScan(tag,tagn);
			else	strcpy(tagn,"?");
			if( attr )
				wordscanY(attr,AVStr(attrn),sizeof(attrn),"^=");
			else	strcpy(attrn,"?");

			/* follow <A HREF=...> and <FRAME SRC=...> only */
			isAnchor = strncasecmp(tagn,"<A",2) == 0
				&& strncasecmp(attrn,"HREF",4) == 0;
			isFrame = strncasecmp(tagn,"<FRAME",6) == 0
				&& strncasecmp(attrn,"SRC",3) == 0;
			if( !isAnchor && !isFrame ){
				continue;
			}

			if( *np == '\\' ){
				continue;
			}
			wordscanY(np,AVStr(url),sizeof(url),"^\"\'> \t\r\n");
			if( dp = strchr(url,'#') )
				truncVStr(dp);

			if( *url == 0 ){
				continue;
			}
			if( strchr(url,'?') ){
				continue;
			}
			if( strncasecmp(url,"javascript:",11) == 0 ){
				continue;
			}
			if( !isFullURL(url) ){
				if( *url == '/' ){
					Strins(AVStr(url),baseserv);
				}else{
					Strins(AVStr(url),basedir);
				}
			} 

			normalizeURL(AVStr(url),sizeof(url));
			if( optX != 0 ){
				if( strstr(url,optX) == 0 ){
					continue;
				}
			}else{
				if( strncmp(url,baseDir,strlen(baseDir))!=0 ){
					continue;
				}
			}
			addUrl(abase,url,0);
		}
	}
	return 0;
}

static int totallen;
int any2fdif(PCStr(pre),PCStr(strip),PCStr(apath),int mi,PCStr(sub),PCStr(conv),FILE *ain,FILE *out,FILE *descf,int *itypep);
/*
 * Convert the input `in' (named `url') with any2fdif(), repeating for
 * each message while the input turns out to be a mailbox (T_MBOX) and
 * is not exhausted.  The sub-document identifier "?n=<seq>&off=<offset>"
 * is passed for each round.
 *
 * After each round the consumed length is added to `totallen' and the
 * result code returned by any2fdif() is counted in the statistics of
 * the suffix entry `ix'; a name that fell into the fallback entry but
 * turned out to be mail is re-counted under the "/MAIL" entry.
 * The global Itype is restored before returning.
 */
void xany2fdif(int ix,PCStr(pre),PCStr(strip),PCStr(url),PCStr(xconv),FILE *in,FILE *out,FILE *descf,int *itypep)
{	int mi;
	int ixs;
	CStr(sub,1024);
	int poff = 0;
	int noff;
	int sItype = Itype;
	int nixs = (int)(sizeof(types[0].ixs)/sizeof(types[0].ixs[0]));

	for( mi = 1;; mi++ ){
		/* ftell() returns long; convert explicitly to match %d */
		sprintf(sub,"?n=%d&off=%d",mi,(int)ftell(in));
		ixs = any2fdif(pre,strip,url,mi,sub,xconv,in,out,descf,itypep);

		noff = (int)ftell(in);
		totallen += (noff - poff);
		poff = noff;
		Lap(0,totallen,0);

		if( ix == 0 && (*itypep == T_MAIL || *itypep == T_MBOX) ){
			types[0].cnt -= 1;
			ix = 2;	/* the "/MAIL" entry */
			types[ix].cnt += 1;
		}
		/* never count outside of the counter array */
		if( 0 <= ixs && ixs < nixs )
			types[ix].ixs[ixs] += 1;

		if( *itypep == T_MBOX && !feof(in) ){
			Itype = T_MBOX;
			continue;
		}
		break;
	}
	Itype = sItype;
}
/*
 * Main loop: process names one by one.  Links queued by scanlinks()
 * are taken first (getnextlink()); when none is pending the next line
 * of `flist' is read and its base directory is recorded in baseDir.
 *
 * For each name: skip it if it contains the -x string or if its suffix
 * is not indexable (unless -a); fetch it with URLget() if it is a full
 * URL, otherwise open it as a local file (walking it first with
 * scandir() if it is a directory); convert it with xany2fdif(); for
 * full URLs with a remaining depth budget, scan it for further links.
 * The process exits after dumping the statistics once MaxPut documents
 * have been put.
 *
 * This function is re-entered through scandir(), so it keeps its stack
 * frame small.
 */
static void scanUrls(FILE *flist,int optr,PCStr(strip),PCStr(pre),PCStr(conv),FILE *out,FILE *descf)
{	CStr(url,1024);
	const char *dp;
	FILE *in;
	int size,octime,omtime,oatime;
	int ix;
	int itype;
	const char *fgot;

	for(;;){
		fgot = getnextlink(AVStr(url),sizeof(url));
		if( fgot == NULL ){
			fgot = fgets(url,sizeof(url),flist);
			if( fgot != NULL ){
				getbase(url,AVStr(baseDir),sizeof(baseDir));
			}
		}
		Lap(fgot==NULL,totallen,0);

		if( fgot == NULL )
			break;
		if( dp = strpbrk(url,"\r\n") )
			truncVStr(dp);
		++NumAny;

		printIfText = 0;
		if( optx ){
			if( strstr(url,optx) ){
				continue;
			}
		}
		if( !doindextype(url,&ix) ){
			if( opt_a == 0 )
				continue;
			else	printIfText = 1;
		}
		if( !opt_q )
			Lap(1,0,"%s\n",url);

		in = NULL;
		if( isFullURL(url) ){
			size = 0;
			octime = 0;
			omtime = 0;
			oatime = 0;

			normalizeURL(AVStr(url),sizeof(url));
			in = URLget(url,0,NULL);
			if( in != NULL && !feof(in) ){
				size = file_size(fileno(in));
				fseek(in,0,0);
			}
		}else{
			if( fileIsdir(url) ){
				scandir(url,optr,strip,pre,conv,out,descf);
			}
			if( File_sizetime(url,&size,&octime,&omtime,&oatime)==0 )
				in = fopen(url,"r");
		}
		if( in != NULL ){
			const char *xconv;
			Fsize = size;
			Mtime = omtime;
			Atime = oatime;
			itype = 0;

			/* a per-suffix converter overrides the general one */
			xconv = getConv(url);
			if( xconv == 0 )
				xconv = conv;

			xany2fdif(ix,pre,strip,url,xconv,in,out,descf,&itype);
			if( isFullURL(url) && 0 < optr ){
				fseek(in,0,0);
				clearerr(in);
				scanlinks(in,optr,url);
			}

			fclose(in);
			/* put back the time stamps recorded before reading (best effort) */
			(void)set_utimes(url,oatime,omtime);
		}

		if( MaxPut && MaxPut <= NumPut ){
			dumptypes();
			exit(0);
		}

	}
}

/*
 * Make sure that `dir' can be used as a directory: create it (mode
 * 0750) when nothing of that name exists, and terminate the process
 * when it cannot be created or when the name is a regular file.
 */
static void makedir(PCStr(dir))
{
	if( !File_is(dir) ){
		if( mkdir(dir,0750) != 0 ){
			fprintf(stderr,"Can't create: %s\n",dir);
			exit(-1);
		}
		return;
	}
	if( File_isreg(dir) ){
		fprintf(stderr,"Is flat file: %s\n",dir);
		exit(-1);
	}
}
/*
 * Open the output files for the index `outname' and process the names
 * listed in `flist' with scanUrls().
 *
 * "-" selects stdout.  Otherwise, when `outbase' is set or the name has
 * no '/', the index is placed under the first existing one of "bank",
 * "$FSXHOME/bank", "../bank" and "$HOME/<indexbase>".  A name ending
 * with '/' denotes a directory "X/", which is created if necessary and
 * holds the files "X/X.fdif", "X/X.desc" and "X/X.summ".
 * With -u no output is opened and only updateSrt() is run.
 * Terminates the process when the FDIF output cannot be opened.
 */
static void scanFilelist(int outbase,PCStr(outname),PCStr(outmode),PCStr(strip),PCStr(pre),PCStr(conv),FILE *flist,int optr)
{	CStr(outdir,1024);
	CStr(path,1024);
	const char *dp;
	const char *dbname;
	CStr(outnameb,1024);
	const char *env;
	CStr(base,1024);

	if( strcmp(outname,"-") == 0 ){
		Out = stdout;
	}else{
		if( outbase || strchr(outname,'/') == 0 ){
			if( fileIsdir("bank") ){
				sprintf(base,"bank/%s/",outname);
				outname = base;
			}else
			if( env = getenv("FSXHOME") ){
				sprintf(base,"%s/bank/%s/",env,outname);
				outname = base;
			}else
			if( fileIsdir("../bank") ){
				sprintf(base,"../bank/%s/",outname);
				outname = base;
			}else
			if( env = getenv("HOME") ){
				/* HOME may be unset; never format a NULL string */
				sprintf(base,"%s/%s",env,indexbase);
				if( fileIsdir(base) ){
					Xsprintf(TVStr(base),"/%s/",outname);
					outname = base;
				}
			}
		}
		if( strtailchr(outname) == '/' ){
			strcpy(outdir,outname);
			outdir[strlen(outdir)-1] = 0;
			makedir(outdir);
			if( dp = strrchr(outdir,'/') ){
				dbname = dp+1;
			}else	dbname = outdir;
			sprintf(outnameb,"%s/%s",outdir,dbname);
			outname = outnameb;
		}
		if( !opt_u ){
			sprintf(path,"%s.fdif",outname);
			Out = fopen(path,outmode);
			fprintf(stderr,"FDIF file: %s\n",path);

			sprintf(path,"%s.desc",outname);
			Descf = fopen(path,outmode);
			fprintf(stderr,"Desc file: %s\n",path);

			sprintf(path,"%s.summ",outname);
			Summf = fopen(path,outmode);
			fprintf(stderr,"Summary file: %s\n",path);
		}
	}
	if( opt_u ){
		updateSrt(outname,strip,pre);
		return;
	}
	if( Out == NULL ){
		fprintf(stderr,"Error: %s\n",outname);
		exit(-1);
	}
	scanUrls(flist,optr,strip,pre,conv,Out,Descf);
}
/*
 * Rebuild "<base>.atime.srt" from the summary file "<base>.sum" (or
 * "<base>.summ"): for each summary line the recorded path is mapped
 * back to a local file name (the prefix `pre' is removed and `strip'
 * is put back in front) and the current access time of that file is
 * written as a 4-byte network-order integer.  For a file that cannot
 * be examined the output position is advanced by one record without
 * writing.  Lines that do not have the expected six fields are skipped.
 * Both files are closed before returning.
 */
void updateSrt(PCStr(base),PCStr(strip),PCStr(pre))
{	CStr(line,1024);
	CStr(path,1024);
	int off,size,mtime,atime,crc,nsize,nmtime,natime;
	int pi;
	FILE *summf;
	FILE *atimef;
	int plen = strlen(pre);

	sprintf(path,"%s.sum",base);
	summf = fopen(path,"r");
	fprintf(stderr,"Summary file: %p %s\n",(void*)summf,path);
	if(summf == NULL){
		sprintf(path,"%s.summ",base);
		summf = fopen(path,"r");
		fprintf(stderr,"Summary file: %p %s\n",(void*)summf,path);
		if(summf == NULL){
			return;
		}
	}

	sprintf(path,"%s.atime.srt",base);
	atimef = fopen(path,"w");
	fprintf(stderr,"Atime file: %p %s\n",(void*)atimef,path);
	if(atimef == NULL){
		fclose(summf);
		return;
	}

 fprintf(stderr,"+++ update sort %p(%ld)  summary %p(%ld/%d)\n",
(void*)atimef,(long)ftell(atimef),(void*)summf,(long)ftell(summf),
file_size(fileno(summf)));

	for( pi = 0;; pi++ ){
		if( fgets(line,sizeof(line),summf) == NULL )
			break;
		if( Xsscanf(line,"%x %x %x %x %d %[^\r\n]",&off,&atime,&mtime,&crc,&size,AVStr(path)) != 6 )
		{
			continue;
		}
		if(*pre && strncmp(path,pre,plen) == 0 ){
			/* overlapping move: shift the rest of the path to the front */
			bcopy(path+plen,path,strlen(path+plen)+1);
		}
		if(*strip){
			Strins(AVStr(path),strip);
		}
		if( File_sizetime(path,&nsize,NULL,&nmtime,&natime) == 0 ){
if( opt_v )
if( natime != atime )
printf(">%d (%5d) %X %X %s\n",atime!=natime,pi,atime,natime,path);
			natime = htonl(natime);
			fwrite(&natime,1,sizeof(natime),atimef);
		}else{
 fprintf(stderr,"BAD[%s]\n",path);
			/* leave the slot of this record unwritten */
			fseek(atimef,(long)sizeof(natime),SEEK_CUR);
		}
	}
	fflush(atimef);
 fprintf(stderr,"#### %ld / %d\n",(long)ftell(atimef),file_size(fileno(atimef)));
	fclose(atimef);
	fclose(summf);
}

/*
 * Print the version (when available), the synopsis, a description,
 * examples and the option list to stderr, then terminate the process
 * with the exit status `code'.  `ac' is not used.
 */
static void usage(int ac,const char *av[],int code)
{	const char *ver = DELEGATE_verdate();
	if( ver ){
		fprintf(stderr,"Any2fdif on %s\n",ver);
	}
 fprintf(stderr,"\n");
 fprintf(stderr,
"Usage: %s [indexname] [[-c cnv] [-s spfx] [-p ppfx] [-f listfile]]*\n",
 av[0]);
 fprintf(stderr,"\n\
Generate a 'FDIF' file to be input to FreyaSX indexer 'findex'.  It outputs\n\
to a FDIF file named 'indexname' (or to stdout by default).  It inputs\n\
documents from files listed in 'listfile' (or stdin by default), then apply\n\
filter command 'cnv' for each file (if specified), then convert it to FDIF\n\
format to be output.  The name of each document can be full URL or name of\n\
local file.  When the name is that of local file, the URL for it can be\n\
created from each file name, stripping prefix string 'spfx' (if specified),\n\
and adding prefix string 'ppfx' (if specified).\n");
 fprintf(stderr,"\nExample:\n");
 fprintf(stderr,"\
  %% find /web/data -type f -name \"*.html\" -print > webidx.list\n");
 /* the list file read here is the one written by the line above */
 fprintf(stderr,"\
  %% %s webidx -s /web/data/ -p http://www.my.org/ -f webidx.list\n",av[0]);
 fprintf(stderr,"\
  %% findex webidx\n\n");
 fprintf(stderr,"\
  (doing above in a single step)\n\
  %% %s -r /web/data -s /web/data -p http://www.my.org/ | findex webidx\n\n",
av[0]);
 fprintf(stderr,"\
  %% find /web/data | %s | findex webidx\n",av[0]);
 fprintf(stderr,"\
  %% %s -r http://www.delegate.org/freyasx | findex sx\n",av[0]);
 fprintf(stderr,"\nOptions:\n\
  indexname        output to the indexname\n\
  -f listfile      scan each file listed in the listfile\n\
  -r directory     recursively scan files under the directory\n\
  -r URL           recursively scan HTML pages under the URL\n\
  -s prefix        strip the prefix from each file-name\n\
  -p prefix        insert the prefix string to be URL\n\
  -c 'command'     preprocess command (as a shell command)\n\
  -c.ext 'command' preprocess command for files with .ext extension\n\
  -v               verbose\n\
  -q               quiet\n\
");
	exit(code);
}

extern int MIME_CONV;
extern int MIME_SPACE_ENCODING;

int xrealpath(PCStr(path),PVStr(rpath),int size);

int any2fdif_main(int ac,const char *av[])
{	int ai;
	const char *a1;
	const char *outname = "default";
	const char *outmode = "w";
	int outbase = 0;
	const char *pre = "";
	const char *strip = "";
	FILE *in = stdin;
	FILE *flist = NULL;
	FILE *tmp;
	CStr(execpath,256);
	const char *env;
	int numin = 0;
	int numout = 0;
	const char *conv = 0;
	FILE *Tmp4 = 0;
	int optr = 0;

	/* these are disalbed in delegated.c */
	MIME_CONV = 0xFFFFFFFF; /* enable all MIME conversion */
	MIME_SPACE_ENCODING = 2; /* default encoding -- typeB */

	AFenv = (AFEnv*)calloc(1,sizeof(AFEnv));

	Start = Time();
	toFullpathENV("PATH",av[0],"r",AVStr(execpath),sizeof(execpath));
	env = getenv("LIBPATH");
	if( env == 0 ){
		const char *dp;
		CStr(xd,256);
		CStr(rx,256);
		CStr(libenv,1024);
		if( dp = strrchr(execpath,'/') ){
			QStrncpy(xd,execpath,dp-execpath+1);
		}else	strcpy(xd,".");
		sprintf(libenv,"LIBPATH=%s:%s/../lib:%s/../etc",xd,xd,xd);

		if( xrealpath(execpath,AVStr(rx),sizeof(rx)) ){
			if( dp = strrchr(rx,'/') )
				truncVStr(dp);
		Xsprintf(TVStr(libenv),":%s:%s/../lib:%s/../etc",
				rx,rx,rx);
		}
		putenv(strdup(libenv));
	}

	if( ac < 2 ){
		if( isatty(fileno(stdin)) )
			usage(ac,av,-1);
	}

	for( ai = 1; ai < ac; ai++ ){
		a1 = av[ai];
		if( strcmp(a1,"-v") == 0 ){
			opt_v = 1;
		}else
		if( strncmp(a1,"-q",2) == 0 ){
			opt_q = 1;
		}else
		if( strcmp(a1,"-a") == 0 ){
			opt_a = 1;
		}else
		if( strcmp(a1,"-o") == 0 || strcmp(a1,"-a") == 0 ){
			numout++;
			if( ai+1 < ac ){
				outname = av[++ai];
				if( strcmp(a1,"-o") == 0 )
					outmode = "w";
				else	outmode = "a";
				outbase = 0;
			}
		}else
		if( strcmp(a1,"-b") == 0 ){
			if( ai+1 < ac ){
				outname = av[++ai];
				outmode = "w";
				outbase = 1;
			}
		}else
		if( strncmp(a1,"-c",2) == 0 ){
			if( ai+1 < ac ){
				if( a1[2] == '.' ){
					setConv(a1+3,av[++ai]);
				}else{
					conv = av[++ai]; 
				}
			}
		}else
		if( strncmp(a1,"-f",2) == 0 ){
			numin++;
			if( ai+1 < ac )
				++ai;
		}else
		if( strcmp(a1,"-p") == 0 ){
			if( ai+1 < ac )
				pre = av[++ai];
		}else
		if( strcmp(a1,"-e") == 0 ){
			if( ai+1 < ac ){
				struct sed_env *sed_new();
				UrlSed = sed_new();
				sed_compile((SedEnv*)UrlSed,av[++ai]);
			}
		}else
		if( strcmp(a1,"-m") == 0 ){
			/* -m "http://%S/%S/%S/%d/%d http://%S/%S/%S/%S%S" */
			if( ai+1 < ac ){
				const char *mount = av[++ai];
				CStr(pat,256);
				const char *dp = wordScan(mount,pat);
				UrlMount[0] = strdup(pat);
				wordScan(dp,pat);
				UrlMount[1] = strdup(pat);
			}
		}else
		if( strncmp(a1,"-X",2) == 0 ){
			if( ai+1 < ac )
				optX = av[++ai];
		}else
		if( strncmp(a1,"-x",2) == 0 ){
			if( ai+1 < ac )
				optx = av[++ai];
		}else
		if( strncmp(a1,"-r",2) == 0 ){
			/* -r[N] recursion depth N */
			if( a1[2] )
				optr = atoi(&a1[2]);
			else	optr = 16;
		}else
		if( strcmp(a1,"-s") == 0 ){
			if( ai+1 < ac )
				strip = av[++ai];
		}else
		if( strcmp(a1,"-u") == 0 ){
			opt_u = 1;
			outmode = "r+";
		}else
		if( a1[0] == '-' && isdigit(a1[1]) ){
			MaxPut = atoi(a1+1);
		}else
		if( a1[0] != '-' && ai == 1 ){
			outname = a1;
			outmode = "w";
			outbase = 1;
		}
	}
	if(opt_u){
		scanFilelist(outbase,outname,outmode,strip,pre,conv,flist,optr);
		return 0;
	}
	if( numout == 0 ){
		if( !isatty(fileno(stdout)) ){
			outname = "-";
			outmode = "a";
			outbase = 0;
			fprintf(stderr,"+++ -o -\n");
		}
	}

	for( ai = 1; ai < ac; ai++ ){
		a1 = av[ai];
		if( strcmp(a1,"-v") == 0 ){
		}else
		if( strncmp(a1,"-q",2) == 0 ){
			opt_q = 1;
		}else
		if( strcmp(a1,"-a") == 0 ){
		}else
		if( strcmp(a1,"-o") == 0 ){
			if( ai+1 < ac ){
				++ai;
			}
		}else
		if( strcmp(a1,"-b") == 0 ){
			if( ai+1 < ac ){
				++ai;
			}
		}else
		if( strncmp(a1,"-c",2) == 0 ){
			if( ai+1 < ac ){
				if( a1[2] == '.' ){
				}else{
					conv = av[++ai]; 
				}
			}
		}else
		if( strncmp(a1,"-f",2) == 0 ){
			NumUrl = 0;
			switch(a1[2]){
				case 't': Itype = T_TEXT; break;
				case 'm': Itype = T_MAIL; break;
				case 'h': Itype = T_HTML; break;
			}
			if( ai+1 < ac ){
				if( strcmp(av[++ai],"-") == 0 )
					flist = stdin;
				else	flist = fopen(av[ai],"r");
				if( flist == NULL ){
					fprintf(stderr,"Error: %s\n",av[ai]);
					exit(-1);
				}
				scanFilelist(outbase,outname,outmode,strip,pre,conv,flist,optr);
			}
		}else
		if( strcmp(a1,"-p") == 0 ){
			if( ai+1 < ac )
				pre = av[++ai];
		}else
		if( strcmp(a1,"-e") == 0 ){
			if( ai+1 < ac ){
				ai++;
			}
		}else
		if( strcmp(a1,"-m") == 0 ){
			if( ai+1 < ac ){
				ai++;
			}
		}else
		if( strncmp(a1,"-X",2) == 0 ){
			if( ai+1 < ac )
				optX = av[++ai];
		}else
		if( strncmp(a1,"-x",2) == 0 ){
			if( ai+1 < ac )
				optx = av[++ai];
		}else
		if( strncmp(a1,"-r",2) == 0 ){
			/* -r[N] recursion depth N */
			if( a1[2] )
				optr = atoi(&a1[2]);
			else	optr = 16;
		}else
		if( strcmp(a1,"-s") == 0 ){
			if( ai+1 < ac )
				strip = av[++ai];
		}else
		if( a1[0] == '-' ){
		}else{
			if( ai == 1 ){
				/* regard 1st arg as -b arg */
				if( outbase == 0 ){
					fprintf(stderr,"+++ -b %s\n",a1);
					outname = a1;
					outmode = "w";
					outbase = 1;
				}
			}else{
				if( Tmp4 == 0 )
					Tmp4 = getTmp(TMP4);
				fprintf(Tmp4,"%s\n",a1);
			}
		}
	}
	if( numin == 0 ){
		if( !isatty(fileno(stdin)) ){
			fprintf(stderr,"+++ -f -\n");
			flist = stdin;
			scanFilelist(outbase,outname,outmode,strip,pre,conv,flist,optr);
		}
	}
	if( Tmp4 != 0 ){
		fflushTmp(Tmp4);
		scanFilelist(outbase,outname,outmode,strip,pre,conv,Tmp4,optr);
	}
	dumptypes();
	return 0;
}

/* Running checksum accumulated by addCRC() */
typedef struct {
	int	crc;	/* CRC32 value so far */
	int	len;	/* number of bytes fed so far */
} CRC;
/* Feed `len' bytes of `str' into the running checksum `crc'. */
static void addCRC(CRC *crc,PCStr(str),int len)
{	int sum;

	sum = strCRC32add(crc->crc,str,len);
	crc->len = crc->len + len;
	crc->crc = sum;
}

/*
 * Write `buf' to `out' as the element <tag [attr]> ... </tag>, with its
 * content passed through encodeEntitiesX().  The raw content and its
 * terminating NUL are added to the checksum `crc' first.
 */
static void encEnt(CRC *crc,FILE *out,PCStr(tag),PCStr(attr),PCStr(buf))
{	CStr(escaped,2048);

	addCRC(crc,buf,strlen(buf));
	addCRC(crc,"",1);	/* the NUL separates consecutive fields */

	encodeEntitiesX(buf,AVStr(escaped),sizeof(escaped));
	if( *attr )
		fprintf(out,"<%s %s",tag,attr);
	else	fprintf(out,"<%s",tag);
	fprintf(out,">\n%s\n</%s>\n",escaped,tag);
}

#define DESCBLOCKSIZE	256
/*
 * Description record, filled by makeDesc().  The text area takes what
 * remains of DESCBLOCKSIZE after the three integers.
 */
typedef struct {
	int	d_size;		/* file size, network byte order */
	int	d_date;		/* set to 0 by makeDesc() */
	int	d_lastmod;	/* modification time, network byte order */
	MStr(	d_url,DESCBLOCKSIZE-(3*sizeof(int)));	/* URL, author, title and digest, stored one after another */
} DescRecord;
/*
 * Fill the description record `Desc' for the current document:
 * size and modification time in network byte order, followed by the
 * URL, the author, the title and `digest' stored one after another in
 * the text area.  When they do not fit, the URL is limited to 2/3 of
 * the area, the author to 1/3, the title to 3/4 (only if it alone
 * exceeds the area) and the digest gets what remains; wbstrcpy() is
 * given the end of the area as its limit.
 */
static void makeDesc(DescRecord *Desc,int fsize,int mtime,PCStr(digest))
{	int rsiz,ulen,alen,tlen,dlen;
	MrefQStr(dp,Desc->d_url); /**/
	const char *dx;

	Desc->d_size = htonl(fsize);
	Desc->d_date = 0;
	Desc->d_lastmod = htonl(mtime);

	/* size of the text area; computed from byte pointers so that no
	 * pointer is truncated to int */
	rsiz = sizeof(DescRecord) - (int)((char*)&Desc->d_url - (char*)Desc);
	ulen = strlen(Url) + 1; 
	alen = strlen(Author) + 1; 
	tlen = strlen(Title) + 1; 
	dlen = strlen(digest) + 1; 
	if( rsiz < ulen+alen+tlen )
		if( (rsiz*2)/3 < ulen ) ulen = (rsiz*2)/3;
	if( rsiz < ulen+alen+tlen )
		if( rsiz/3 < alen ) alen = rsiz/3;
	if( rsiz < ulen+alen+tlen )
		if( rsiz < tlen ) tlen = (rsiz*3)/4;
	if( rsiz < ulen+alen+tlen+dlen )
		dlen = rsiz - (ulen+alen+tlen);
	dp = Desc->d_url;
	dx = Desc->d_url + (sizeof(Desc->d_url)-1);
	dp = wbstrcpy(AVStr(dp),dx,Url,ulen);
	dp = wbstrcpy(AVStr(dp),dx,Author,alen);
	dp = wbstrcpy(AVStr(dp),dx,Title,tlen);
	dp = wbstrcpy(AVStr(dp),dx,digest,dlen);
}
/*
 * Decide whether a line of a message body should be left out.
 * Returns 3 for a line starting with "-----BEGIN", 1 for an
 * attribution line ("In message ...", "On ... wrote", "... writes:"
 * and the like) or a quoted line (starting with '>' or '|', optionally
 * after one space, or with '>' after a prefix consisting only of
 * alphanumerics, white space, '.', '-' and '_'), and 0 otherwise.
 */
static int skipLine(PCStr(line))
{	const char *dp;
	unsigned char ch;

	if( strncasecmp(line,"-----BEGIN",10)==0 )
		return 3;

	if( strncasecmp(line,"In message",10)==0
	 || strncasecmp(line,"In article",10)==0
	 || strncasecmp(line,"In <",4)==0
	 || (strncasecmp(line,"On ",3)==0 && (
		   strstr(line,"wrote")
		|| strstr(line,")\n")
		|| strstr(line,">\n")
	    ))
	 || ((dp = strskip(line,"wrote"  )) && strstr(dp,">\n"))
	 || ((dp = strskip(line,"said:"  )) && (*dp==' '||*dp=='\n'))
	 || ((dp = strskip(line,"wrote:" )) && (*dp==' '||*dp=='\n'))
	 || ((dp = strskip(line,"writes:")) && (*dp==' '||*dp=='\n'))
	)
		return 1;

	if( line[0] == '>'
	 || line[0] == '|'
	 || strncmp(line," >",2) == 0
	 || strncmp(line," |",2) == 0
	)
		return 1;

	/* the <ctype.h> functions need an unsigned char value; bodies may
	 * contain 8-bit text */
	for( dp = line; (ch = (unsigned char)*dp) != 0; dp++ ){
		if( !isalnum(ch) && !isspace(ch)
		 && ch != '.' && ch != '-' && ch != '_' )
			break;
	}
	if( *dp == '>' )
		return 1;

	return 0;
}

void getCharset(PCStr(where),PVStr(charset),PCStr(fval));
/*
 * Look at up to eight "<META" tags in `inb'; for each one that has both
 * HTTP-EQUIV and CONTENT attributes and whose HTTP-EQUIV is
 * "Content-Type", pass the CONTENT value to getCharset() to set
 * `charset'.
 */
static void scanHtml0(PCStr(inb),PVStr(charset))
{	CStr(equiv,32);
	CStr(content,256);
	const char *mp;
	int tries;
	int found;

	mp = inb;
	for( tries = 0; tries < 8; tries++ ){
		mp = strcasestr(mp,"<META");
		if( mp == 0 )
			break;
		mp += 5;
		found = scanAttrs(mp,4, "HTTP-EQUIV",AVStr(equiv),sizeof(equiv),
			"CONTENT",AVStr(content),sizeof(content));
		if( found != 2 )
			continue;
		if( strcaseeq(equiv,"Content-Type") )
			getCharset("HTML",AVStr(charset),content);
	}
}
/*
 * Build the URL of a document into "url" (of "usiz" bytes) from the
 * prefix "pre" and the (cache file) path "path":
 *  - a trailing "/=" is reduced to "/",
 *  - escapes made for "*?%@=" are undone by url_unescape(),
 *  - runs of '/' after the first ':' (and one '/' following it) are
 *    collapsed into a single '/',
 *  - then the UrlSed script and the UrlMount rewriting are applied if set.
 */
static void makeUrl(PVStr(url),int usiz,PCStr(pre),PCStr(path))
{	refQStr(wp,url); /**/
	const char *rp;

	sprintf(url,"%s%s",pre,path);
	if( strtailstr(url,"/=") ){
		/* index.html or so cached by DeleGate */
		setVStrEnd(url,strlen(url)-1);
	}
	/* URL path is escaped to be used as a file name of cache */
	url_unescape(AVStr(url),AVStr(url),usiz,"*?%@=");

	/* find where to start collapsing: just after ":" or ":/" */
	rp = strchr(url,':');
	if( rp != NULL ){
		rp++;
		if( *rp == '/' )
			rp++;
	}else{
		rp = url + strlen(url);
	}

	/* copy in place, keeping only the first one of successive '/' */
	wp = (char*)rp;
	while( *rp ){
		setVStrPtrInc(wp,*rp);
		if( *rp == '/' ){
			while( rp[1] == '/' )
				rp++;
		}
		rp++;
	}
	setVStrEnd(wp,0);

	if( UrlSed ){
		CStr(edited,0x1000);
		sed_execute1((SedEnv*)UrlSed,url,AVStr(edited),0);
		QStrncpy(url,edited,usiz);
	}
	if( UrlMount[0] ){
		CStr(copy,0x1000);
		const char *rsp;
		const char *rfp;
		UTag *uv[33],ub[32];

		uvinit(uv,ub,32);
		FStrncpy(copy,url);
		uvfromsfX(copy,0,UrlMount[0],uv,&rsp,&rfp);
		/* rewrite only when both remainders are empty */
		if( rsp[0] == 0 && rfp[0] == 0 ){
			uvtosf(AVStr(url),usiz,UrlMount[1],uv);
		}
	}
}

/*
 * Write "str" to "out" without producing too long lines, so that a
 * multi-byte character is not broken in the line buffer of the receiver
 * (findex).  Once more than 1024 bytes have been put on a line, a newline
 * is inserted just before the next byte without the 8th bit.
 * A newline is appended if the (non-empty) string does not end with it.
 */
void
fputsX(PCStr(str),FILE *out)
{	const char *cur;
	int last = '\n';	/* so that nothing is added for an empty string */
	int col = 0;		/* bytes put on the current line */
	int want_break = 0;	/* the line is long enough to be broken */

	for( cur = str; *cur != 0; cur++ ){
		char c = *cur;

		if( c == '\n' ){
			col = 0;
			want_break = 0;
		}else{
			if( want_break && (c & 0x80) == 0 ){
				putc('\n',out);
				col = 0;
				want_break = 0;
			}else
			if( !want_break && 1024 < col ){
				want_break = 1;
			}
			col++;
		}
		putc(c,out);
		last = c;
	}
	if( last != '\n' ){
		putc('\n',out);
	}
}
/*
 * Copy the text in "in" to "out" through the code converter Ccx.
 *
 * If "charset" is given but empty, the first block read is searched for
 * a charset declared in a <META> tag (scanHtml0()) and it is set as the
 * input code of Ccx.
 *
 * Returns 0 on success, E_BINARY if a NUL byte is found in the input,
 * or E_EMPTY if nothing could be read.
 * "apath" is not used.
 */
int scanText(FILE *in,FILE *out,PVStr(charset),PCStr(apath))
{	CStr(inb,16*1024);
	CStr(outb,32*1024); /* for SJIS 1byte kana -> EUC 2byte kana */
	int nb,rcc,ri;
	int bodysiz = 0;

	for( nb = 0; (rcc = fread(inb,1,sizeof(inb)-1,in)) != 0; nb++ ){
		inb[rcc] = 0;
		if( charset && charset[0] == 0 && nb == 0 ){
			scanHtml0(inb,AVStr(charset));
			if( charset[0] ){
				CCX_setincode((CCXP)Ccx,charset);
			}
		}
		for( ri = 0; ri < rcc; ri++){
			if( inb[ri] == 0 ){
				/* the stray argument without a conversion
				 * specifier has been removed
				 */
				if( opt_v )
				fprintf(stderr,"binary file.\n");
				return E_BINARY;
			}
		}
		/* no NUL in inb[0..rcc-1] here, thus strlen(inb) == rcc */
		CCXexec((CCXP)Ccx,inb,rcc,AVStr(outb),sizeof(outb));
		bodysiz += rcc;
		fputsX(outb,out);
	}
	if( bodysiz == 0 ){
		/* without .txt or .html suffix */
		if( opt_v )
		fprintf(stderr,"empty file.\n");
		return E_EMPTY;
	}
	return 0;
}

int sysfilter(PCStr(conv),FILE *in,FILE *out);
void scanMail(FILE *in,FILE *out,PCStr(apath),int ismbox,int lev);
void scanHtml(FILE *in,FILE *tmp,PCStr(apath));

/*
 * Convert a document read from "ain" into a <DOC> record of FDIF
 * written to "out".
 *
 *  pre     prefix of the URL to be generated
 *  strip   leading part of "apath" to be removed to make the URL
 *  apath   path name of the input (shown in messages and the summary)
 *  mi      not used here
 *  sub     suffix added to the URL when the input is a MBOX
 *  conv    if not NULL, a command (run via sysfilter()) through which
 *          the input is filtered before scanning
 *  descf   if not NULL, the DescRecord of the document is appended to it
 *  itypep  if not NULL, receives the type of the input finally detected
 *
 * Steps: detect the type of the input (MBOX, mail, or a cached HTTP
 * response with its header), extract plain text into a temporary file
 * with scanText()/scanHtml()/scanMail(), then copy it to "out" with
 * entities encoded while making the CRC, the digest, and a guessed
 * author from copyright lines.
 *
 * Returns E_OK on success, otherwise E_DIR, E_EOF, E_BINARY, E_CTRL,
 * E_ENCODED, E_NONTEXT, or the error code from scanText().
 */
int any2fdif(PCStr(pre),PCStr(strip),PCStr(apath),int mi,PCStr(sub),PCStr(conv),FILE *ain,FILE *out,FILE *descf,int *itypep)
{	FILE *in = ain;
	const char *path = apath;
	CStr(head,256);
	int ismbox = 0;
	int ch,issp,wassp;
	CStr(inb,16*1024);
	const char *ip;
	CStr(outb,32*1024); /* for SJIS 1byte kana -> EUC 2byte kana */
	int topoff;
	int itype = 0;
	CStr(digest,256);
	refQStr(dp,digest); /**/
	const char *dx;
	int ic;
	DescRecord Desc;
	int skip,skiphead;
	FILE *Tmp0,*Tmp1;
	CRC crcb = {0,0}, *crc = &crcb;
	int pch,ite;
	CStr(ctype,64);
	CStr(charset,64);
	int soff;
	CStr(prevb,1024);
	int noauth;
	int copymark;
	int ecode = 0;
	CStr(cwd,256);

	/* cwd is copied with strcpy() later; do not leave it undefined */
	if( getcwd(cwd,sizeof(cwd)) == NULL )
		cwd[0] = 0;

	if( strncmp(path,strip,strlen(strip)) == 0 )
		path += strlen(strip);
	if( strncmp(path,"./",2) == 0 )
		path += 2;
	if( fileIsdir(path) ){
		return E_DIR;
	}
	if( conv != NULL ){
		/* replace the input with the output of the filter */
		FILE *Tmp3;
		Tmp3 = getTmp(TMP3);
		ftruncate(fileno(Tmp3),(off_t)0);
		sysfilter(conv,in,Tmp3);
		fseek(Tmp3,0,0);
		in = Tmp3;
	}

	/* clear the attributes of the previous document */
	Title[0] = 0;
	Keywd[0] = 0;
	Descr[0] = 0;
	Heads[0] = 0;
	Author[0] = 0;
	Address[0] = 0;
	Links[0] = 0;
	Location[0] = 0;
	XUri[0] = 0;
	XRefs[0] = 0;
	ctype[0] = 0;
	charset[0] = 0;
	digest[0] = 0;

	CCXcreate("*",ccx_outcode,(CCXP)Ccx);
	CCXthru8((CCXP)Ccx,thru8);
	CCXdisable = 0;

	topoff = ftell(in);
	soff = ftell(out);
	if( fgets(head,sizeof(head),in) == NULL ){
		return E_EOF;
	}
	if( strncmp(head,"From ",5) == 0 ){
		ismbox = 1;
	}

	itype = Itype;
	if( Itype == T_MBOX ){
		/* MBOX */
	}else
	if( strncmp(head,"HTTP/",5) != 0 ){
		/* not a HTTP response: see if it is a mail */
		int ismail;
		fseek(in,topoff,0);
		ismail = isRFC822(in);
		if( itype == T_MAIL ){
			if( ismail <= 0 ){
				return E_BINARY;
			}
		}else{
			if( 0 < ismail ){
/*
 fprintf(stderr,"#### ISMAIL %5d %s\n",ismail,apath);
*/
				itype = T_MAIL;
			}
		}
	}else{
		/* a HTTP response (with its header) as cached by DeleGate */
		int code = 0;
		sscanf(head,"HTTP/%*s %d",&code);
		/* "%d" accepts a negative number, which must not be used
		 * as an index of Codes[]
		 */
		if( 0 <= code )
			Codes[code%1000] += 1;
		if( code < 200 || 300 <= code ){
			if( opt_v )
			fprintf(stderr,"[%s%s] not user data (%d)\n",pre,path,
				code);
			return E_CTRL;
		}
		while( fgets(head,sizeof(head),in) != NULL ){
			CStr(fnam,64);
			CStr(fval,128);
			if( *head == '\r' || *head == '\n' ){
				/* end of the header: exclude it from the size */
				Fsize -= ftell(in);
				break;
			}
			scan_field1(head,AVStr(fnam),sizeof(fnam),AVStr(fval),sizeof(fval));
			if( strcaseeq(fnam,"Content-Type") ){
				const char *sp;
				getCharset("HEAD",AVStr(charset),fval);
				FStrncpy(ctype,fval);
				if( sp = strchr(fval,';') )
					truncVStr(sp);
				if( streq(fval,"text/html") )
					itype = T_HTML;
				else
				if( strstr(fval,"application/xhtml") )
					itype = T_HTML;
				else
				if( streq(fval,"text/plain") )
					itype = T_TEXT;
				else
				if( streq(fval,"application/x-javascript")
				 || streq(fval,"application/javascript")
				 || streq(fval,"text/javascript")
				)	itype = T_JS;
			}else
			if( strcaseeq(fnam,"Last-Modified") ){
				Mtime = scanHTTPtime(fval);
			}else
			if( strcaseeq(fnam,"Content-Encoding") ){
				if( opt_v )
				fprintf(stderr,"[%s%s] encoded (%s)\n",
					pre,path,fval);
				return E_ENCODED;
			}
		}
		if( itype == 0 || itype == T_JS ){
			/* the message is optional, the return is not */
			if( strstr(ctype,"image/") == NULL
			 && strstr(ctype,"text/css") == NULL
			 && opt_v ){
				fprintf(stderr,"[%s] not text [%s%s]\n",ctype,pre,path);
			}
			return E_NONTEXT;
		}

		/* the top directory of the cache represents the protocol */
		if( strncmp(path,"nntp/",5) == 0){
			pre = "nntp://";
			path += 5;
			itype = T_MAIL;
			fseek(in,topoff,0);
		}
		if( strncmp(path,"http/",5) == 0){
			pre = "http://";
			path += 5;
		}
		if( strncmp(path,"https/",6) == 0){
			/* NOTE(review): mapped to "http://", not "https://";
			 * kept as is -- confirm whether it is intentional
			 */
			pre = "http://";
			path += 6;
		}
	}

	makeUrl(AVStr(Url),sizeof(Url),pre,path);
	NumUrl++;
	if( NumUrl == 1 ){
		fprintf(stderr,"First URL: %s\n",Url);
		fprintf(stderr,"...\n");
	}
	if( opt_v ){
		fprintf(stderr,"%04d [%s] ",NumUrl,apath);
	}

	if( charset[0] ){
		CCX_setincode((CCXP)Ccx,charset);
	}

	if( itype == T_HTML ){
		/* convert the character code before parsing the HTML */
		Tmp0 = getTmp(TMP0);
		if( ecode = scanText(in,Tmp0,AVStr(charset),apath) )
			return ecode;

		fflushTmp(Tmp0);
		in = Tmp0;
		CCXdisable = 1;
		/*
		CCXcounts((CCXP)Ccx);
		*/
	}

	/* extract the plain text into Tmp1 */
	Tmp1 = getTmp(TMP1);
	switch( itype ){
		case T_TEXT: ecode = scanText(in,Tmp1,VStrNULL,apath); break;
		case T_HTML: scanHtml(in,Tmp1,apath); break;
		case T_MBOX:
		case T_MAIL: scanMail(in,Tmp1,apath,ismbox,0); break;
	}
	if( ecode != 0 ){
		return ecode;
	}
	fflushTmp(Tmp1);
	if( Location[0] ){
		makeUrl(AVStr(Url),sizeof(Url),pre,Location);
	}
	if( itype == T_MAIL && !feof(in) ){
		/* something is left after the first message */
		if( ismbox ){
			itype = T_MBOX;
			Lap(1,0,"MBOX %s\n",apath);
		}
	}
	if( itype == T_MBOX ){
		Xstrcat(AVStr(Url),sub);
	}

/*
	strCRC32add(crc,path,strlen(path));
	strCRC32add(crc,"",1);
*/
/* CRC only for content is desired ? */
	addCRC(crc,path,strlen(path));
	addCRC(crc,"",1);

	fprintf(out,"<DOC HREF=\"%s\">\n",Url); /* encEnt("URL") */
	{
		if( isFullpath(path) || isFullURL(path) ){
			fprintf(out,"<DOCPATH>%s</DOCPATH>\n",path);
		}else{
			CStr(fpath,2048);
			strcpy(fpath,cwd);
			chdir_cwd(AVStr(fpath),apath,1);
			fprintf(out,"<DOCPATH>%s</DOCPATH>\n",fpath);
		}
	}

	if( strncasecmp(Author,"mailto:",7) == 0 )
		ovstrcpy(Author,Author+7);
	if( Author[0] ){
		if( strstr(Author,"=?") ){
			/* MIME encoded-word */
			CStr(buf,256);
			MIME_strHeaderDecode(Author,AVStr(buf),sizeof(buf));
			CCXexec((CCXP)Ccx,buf,strlen(buf),AVStr(Author),sizeof(Author));
		}
	}
	if( Author[0] )
		withAuthor++;

	fprintf(out,"<TEXT WEIGHT=\"3\">\n"); /* encEnt("TEXTW3"); */

	/* the digest begins with the description if there is one */
	dp = digest;
	dx = digest + (sizeof(digest)-1);
	if( Descr[0] ){
		wbstrcpy(AVStr(digest),dx,Descr,0);
		dp += strlen(digest);
		if( dp < dx )
			setVStrPtrInc(dp,' ');
	}

	skiphead = 0;
	pch = -1;
	ite = 0;
	prevb[0] = 0;
	noauth = Author[0] == 0;
	copymark = 0;
	for( ic = 0; ; ic++ ){
		if( fgets(inb,sizeof(inb),Tmp1) == NULL )
			break;
		/* normalize white spaces except the newline */
		for( ip = inb; ch = *ip; ip++ ){
			if( isspace(ch) && ch != '\n' )
				*(char*)ip = ' ';
		}
		addCRC(crc,inb,strlen(inb));
		encodeEntitiesX(inb,AVStr(outb),sizeof(outb));
		fputs(outb,out);

		/* guess the author from a copyright notice, which may
		 * continue over lines (accumulated in prevb)
		 */
		if( noauth )
		if( Author[0] == 0 || copymark == 0 && strcasestr(inb,"&copy;") )
		if( prevb[0]
		 || itype == T_HTML && strcasestr(inb,"&copy;")
		 || strcasestr(inb,"Copyright")
		 || strcasestr(inb,"All Right")
		){
			CStr(auth,256);
			int mark = 0;
			auth[0] = 0;
			if( strlen(prevb) + strlen(inb) < sizeof(prevb) ){
				strcat(prevb,inb);
				if( strcasestr(prevb,"&copy;") )
					mark = 1;
				if( copy2auth(prevb,AVStr(auth),sizeof(auth),apath,0) ){
				}else{
					prevb[0] = 0;
				}
			}else{
				prevb[0] = 0;
			}
			if( auth[0] )
			if( Author[0] == 0 || strcmp(auth,Author) != 0 ){
				copymark = mark;
				wbstrcpy(AVStr(Author),Author+(sizeof(Author)-1),auth,0);
			}
		}

		/* add the line to the digest unless it is to be skipped */
		if( skiphead ){
			if( inb[0] == '\r' || inb[0] == '\n' )
				skiphead = 0;
		}else
		if( skiphead == 0 && dp < dx ){
			skip = skipLine(inb);
			skiphead = skip & 2;
			if( skip == 0 ){
				wassp = 1;
				for( ip = inb; ch = *ip; ip++ ){
					int nc = ip[1];
					/* two bytes with the 8th bit are
					 * copied as a pair, or not at all
					 */
					if( (0x80 & ch) && (0x80 & nc) ){
						if( dp+1 < dx ){
							setVStrPtrInc(dp,ch);
							setVStrPtrInc(dp,nc);
							ip++;
							pch = ch;
						}
						wassp = 0;
						continue;
					}
					if( dx <= dp )
						break;
					if( issp = isspace(ch) )
						ch = ' ';
					/* squeeze successive spaces, and a
					 * char. repeated more than 3 times
					 */
					if( !(wassp && issp) ){
						if( ch == pch )
							ite++;
						else	ite = 0;
						if( ite < 3 ){
							setVStrPtrInc(dp,ch);
							pch = ch;
						}
					}
					wassp = issp;
				}
			}
		}
	}
	setVStrEnd(dp,0);

	crc->crc = strCRC32end(crc->crc,crc->len);
	if( opt_v ){
		fprintf(stderr,"%08X\r\n",crc->crc);
	}
	fprintf(out,"\n</TEXT>\n");

	/* the last resort for the author: <ADDRESS> or META copyright */
	if( Author[0] == 0 )
	if( Address[0] != 0 )
	if( strcasestr(Address,"converted") == 0 )
	if( strcasestr(Address,"generated") == 0 )
	if( strstr(Address,"Exp") == 0 )
	{	const char *ap;

		if( ap = strcasestr(Address,"mail") ){
			ap += 4;
			if( *ap == ' ' )
				ap++;
			if( *ap == ':' )
				ap++;
			else	ap = Address;
		}else
		if( ap = strcasestr(Address," by ") ){
			ap += 4;
		}else{
			ap = Address;
		}
		copy2auth(ap,AVStr(Author),sizeof(Author),apath,1);
	}
	if( noauth && Author[0] ){
		guessedAuthor++;
	}
	if( XUri[0]  ) encEnt(crc,out,"URL","",XUri);
	if( Heads[0] ) encEnt(crc,out,"TEXT","WEIGHT=\"2\"",Heads);
	if( Descr[0] ) encEnt(crc,out,"TEXT","WEIGHT=\"1\"",Descr);
	if( XRefs[0] ) encEnt(crc,out,"LINK","",XRefs);
	if( Links[0] ) encEnt(crc,out,"LINK","",Links);
	if( digest[0]) encEnt(crc,out,"DIGEST","",digest);
	if( Author[0]) encEnt(crc,out,"AUTHOR","",Author);
	if( Title[0] ) encEnt(crc,out,"TITLE","",Title);
	if( Keywd[0] ) encEnt(crc,out,"KEYWORD","",Keywd);

	/* time_t does not necessarily match "%X" (unsigned int) */
	fprintf(out,"<METAX name=DATE>%X</DATE>\n",(unsigned int)time(NULL));
	fprintf(out,"<METAX name=LASTMODIFIED>%X</META>\n",Mtime);
	fprintf(out,"<METAX name=LASTACCESSED>%X</META>\n",Atime);
	fprintf(out,"<METAX name=SIZE>%X</META>\n",Fsize);
	fprintf(out,"<METAX name=CRC32>%08X</META>\n",crc->crc);
	fprintf(out,"</DOC>\n");
	fflush(out);

	makeDesc(&Desc,Fsize,Mtime,digest);
	if(descf){
		fwrite(&Desc,1,sizeof(Desc),descf);
		fflush(descf);
	}

	NumPut++;
	if( printIfText ){
		fprintf(stderr,"IsText: %s\n",Url);
	}

	if(Summf){
		fprintf(Summf,"%08X %08X %08X %08X %8d %s\n",soff,Atime,Mtime,
		crc->crc,Fsize,apath);
	}
	if( itypep )
		*itypep = itype;
	return E_OK;
}

/*
 * Make the MD5 of "str" in the dotted form as "pfx" + "xx.xx.xx..." into
 * "dots": at most "len" characters of the string made by toMD5() are
 * used, with a '.' put between every two of them.
 * The digest is made before "dots" is written, so "dots" may point to
 * the end of "str".
 */
void toMD5dots(PCStr(pfx),PCStr(str),PVStr(dots),int len){
	CStr(hex,64);
	refQStr(op,dots); /**/
	const char *hp;
	int nput;

	toMD5(str,hex);

	cpyQStr(op,dots);
	if( pfx && *pfx ){
		strcpy(dots,pfx);
		op += strlen(op);
	}
	nput = 0;
	for( hp = hex; nput < len && *hp; hp++ ){
		if( 0 < nput && nput % 2 == 0 ){
			setVStrPtrInc(op,'.');
		}
		setVStrPtrInc(op,*hp);
		nput++;
	}
	setVStrEnd(op,0);
}
/*
 * Convert the list of "<message-id>" in "irefs" (the value of a
 * References field) into "orefs" of "osize" bytes.  Each identifier
 * which is closed with '>' is put as "<id>" immediately followed by its
 * "x-uri." dotted MD5; a space is put before each element but the
 * first one scanned.
 */
static void scanRefs(PCStr(irefs),PVStr(orefs),int osize){
	int nref;
	const char *olimit;
	const char *ip;
	CStr(msgid,256);
	refQStr(op,orefs); /**/
	CStr(dotted,256);

	olimit = orefs + (osize-1);
	setVStrEnd(orefs,0);

	nref = 0;
	ip = irefs;
	while( (ip = strchr(ip,'<')) != NULL ){
		ip = wordscanY(ip,AVStr(msgid),sizeof(msgid),"^>");
		if( *ip == '>' ){
			if( 0 < nref && op < olimit ){
				setVStrPtrInc(op,' ');
			}
			strcat(msgid,">");
			QStrncpy(op,msgid,olimit-op);
			op += strlen(op);

			toMD5dots("x-uri.",msgid,AVStr(dotted),32);
			QStrncpy(op,dotted,olimit-op);
			op += strlen(op);
		}
		nref++;
	}
}

/*
 * Scan a message in the RFC822 format from "in" and write its body
 * text to "out".
 *
 * Header: Message-Id is stored into XUri (followed by its "x-uri."
 * dotted MD5), References into XRefs, Subject into Title, From into
 * Author, and Date into Mtime.  Subject and From are MIME-decoded
 * (and converted with Ccx unless CCXdisable is set) beforehand.
 *
 * If the header at lev == 0 shows a multipart Content-Type or a
 * quoted-printable / base64 Content-Transfer-Encoding, the message is
 * read again from its top, decoded with PGPdecodeMIME() into a
 * temporary file, and scanned recursively with lev+1.
 *
 * Body: lines of uuencoded data ("begin <digit>" ... "end") are
 * dropped, "<URL:...>" are collected into Links, and with "ismbox"
 * the scan stops before the next "From " line.
 */
void scanMail(FILE *in,FILE *out,PCStr(apath),int ismbox,int lev)
{	CStr(line,1024);
	CStr(xline,1024);
	CStr(fnam,32);
	CStr(fval,1024);
	CStr(yline,2*1024);
	const char *dp;
	const char *ep;
	int li;
	int inUU;
	int topoff;
	int deMime = 0;
	/* NOTE(review): where lkp points initially depends on MrefQStr() -- confirm */
	MrefQStr(lkp,Links); /**/
	const char *lkx = Links+(sizeof(Links)-1);

	/* remember the top to read the message again for MIME decoding */
	topoff = ftell(in);
	while( RFC822_fgetsHeaderField(AVStr(line),sizeof(line),in) != NULL ){
		/* an empty line ends the header */
		if( *line == '\r' || *line == '\n' )
			break;

		scan_field1(line,AVStr(fnam),sizeof(fnam),AVStr(fval),sizeof(fval));
		if( strcaseeq(fnam,"Content-Type") ){
			if( strncaseeq(fval,"multipart/",10) ){
				if( lev == 0 ){
					deMime |= 1;
					break;
				}
			}
		}else
		if( strcaseeq(fnam,"Content-Transfer-Encoding") ){
			if( strcasestr(fval,"quoted-printable")
			 || strcasestr(fval,"base64") ){
				if( lev == 0 ){
					deMime |= 2;
					break;
				}
			}
		}else
		if( strcaseeq(fnam,"Subject")
		 || strcaseeq(fnam,"From") ){
			/* decode the field, then split it again into fnam and fval */
			if( hide_addr )
				MIME_rewaddrs(hide_addr,AVStr(line));
			MIME_strHeaderDecode(line,AVStr(xline),sizeof(xline));
			if( CCXdisable ){
				scan_field1(xline,AVStr(fnam),sizeof(fnam),AVStr(fval),sizeof(fval));
			}else{
				CCXexec((CCXP)Ccx,xline,strlen(xline),AVStr(yline),sizeof(yline));
				scan_field1(yline,AVStr(fnam),sizeof(fnam),AVStr(fval),sizeof(fval));
			}
		}
		if( strcaseeq(fnam,"Message-Id") ){
			int len;
			linescanX(fval,AVStr(XUri),sizeof(XUri));

			/* append the dotted MD5 of the Message-Id to itself */
			len = strlen(XUri);
			toMD5dots("x-uri.",XUri,QVStr(XUri+len,XUri),32);
		}else
		if( strcaseeq(fnam,"References") ){
			scanRefs(fval,AVStr(XRefs),sizeof(XRefs));
		}else
		if( strcaseeq(fnam,"Subject") ){
			linescanX(fval,AVStr(Title),sizeof(Title));
		}else
		if( strcaseeq(fnam,"From") ){
			linescanX(fval,AVStr(Author),sizeof(Author));
		}else
		if( strcaseeq(fnam,"Date") ){
			Mtime = scanNNTPtime(fval);
		}
	}

	if( lev == 0 && deMime ){
		/* decode the MIME encoding and scan the result instead */
		FILE *Tmp2;
		Tmp2 = getTmp(TMP2);
		fseek(in,topoff,0);

 /* retry the seek once after fflush() if the position is not as requested */
 if( ftell(in) != topoff ){
  fflush(in);
  fseek(in,topoff,0);
  fprintf(stderr,"## fseek retried: %d %d\n",topoff,ftell(in));
 }

		if( ismbox ){
			/* cut out this message, up to the next "From " line */
			FILE *Tmp5 = getTmp(TMP5);
			int li;
			for(li = 0; fgets(line,sizeof(line),in) != NULL; li++){
				if( li != 0 )
				if( strncmp(line,"From ",5) == 0 ){
					backseek(in,strlen(line));
					break;
				}
				fputs(line,Tmp5);
			}
			fflushTmp(Tmp5);
			in = Tmp5;
		}

		PGPdecodeMIME(in,Tmp2,NULL,0x2FF,0,0);
		fflushTmp(Tmp2);
		scanMail(Tmp2,out,apath,ismbox,lev+1);
		return;
	}

	inUU = 0;
	for( li = 0; fgets(xline,sizeof(xline),in) != NULL; li++ ){
		if( ismbox ){
			/* leave the next message unread */
			if( strncmp(xline,"From ",5) == 0 ){
				backseek(in,strlen(xline));
				break;
			}
		}
		if( inUU == 0 ){
			/* the beginning of uuencoded data as "begin 644 file" */
			if( strncmp(xline,"begin ",6) == 0 )
			if( isdigit(xline[6]) )
			{
				inUU = 1;
				continue;
			}
			if( dp = strstr(xline,"<URL:") ){
				dp += 5;
				if( ep = strchr(dp,'>') ){
					/* add the URL and a newline if they fit in Links */
					if( ep-dp+1 < lkx-lkp ){
						QStrncpy(lkp,dp,ep-dp+1);
						lkp += strlen(lkp);
						setVStrPtrInc(lkp,'\n');
						setVStrEnd(lkp,0);
					}
				}
			}
		}else{
			/* in uuencoded data: drop lines including the "end" */
			if( strncmp(xline,"end",3) == 0 ){
				inUU = 0;
				continue;
			}
			continue;
		}
		if( CCXdisable ){
			fputs(xline,out);
		}else{
			CCXexec((CCXP)Ccx,xline,strlen(xline),AVStr(yline),sizeof(yline));
			fputs(yline,out);
		}
	}
}
/*
 * Read successive alphabetic characters from "in" into "str", at most
 * size-1 of them.  The first character which is not alphabetic is
 * pushed back with ungetc().
 */
static void scanalpha(FILE *in,PVStr(str),int size)
{	int n = 0;
	int ch;

	while( n < size-1 ){
		ch = getc(in);
		if( !isalpha(ch) ){
			ungetc(ch,in);
			break;
		}
		setVStrElem(str,n,ch); /**/
		n++;
	}
	setVStrEnd(str,n);
}

#define	BUFFED	(0x80000000|' ')

int scanSRC(PCStr(src),PVStr(href),int size);
int scanHREF(PCStr(src),PVStr(href),int size);
int scanLink(PCStr(src),PVStr(nam),int nsiz,PVStr(con),int csiz);
int skipComment(FILE *in,PVStr(comment),int siz);

/*
 * Extract the text of the HTML in "in" into "tmp", removing tags,
 * comments, and the contents of SCRIPT and STYLE, and decoding
 * character entities.  Attributes of the document are gathered
 * into globals on the way:
 *   Title    text in <TITLE>, or META SUBJECT
 *   Heads    text in <Hn>, separated with ','
 *   Address  text in <ADDRESS>, or META COPYRIGHT
 *   Author   LINK REL/REV=MADE or META AUTHOR; otherwise a candidate
 *            found in a comment or the first "mailto:" anchor
 *   Links    "http://" URLs in <A HREF> and <FRAME SRC>, one per line
 *   Descr, Keywd, Location, Mtime  from the corresponding META
 */
void scanHtml(FILE *in,FILE *tmp,PCStr(apath))
{	int ch,pch,end,ech;
	const char *dp;
	CStr(tag,0x10000);
	CStr(tagn,32);
	CStr(attrn,32);
	CStr(nam,32);
	CStr(con,1024);
	int inHead = 0;
	int inTitle = 0;
	int inScript = 0;
	int inStyle = 0;
	int inHeading = 0;
	int inAddress = 0;
	int preisspace = 0;
	MrefQStr(tip,Title); /**/
	const char *tix = Title+(sizeof(Title)-1);
	MrefQStr(htp,Heads); /**/
	const char *htx = Heads+(sizeof(Heads)-1);
	MrefQStr(adp,Author); /**/
	const char *adx = Author+(sizeof(Author)-1);
	MrefQStr(ddp,Address); /**/
	const char *ddx = Address+(sizeof(Address)-1);
	MrefQStr(lkp,Links); /**/
	const char *lkx = Links+(sizeof(Links)-1);
	/* where the output started, to discard what precedes the body mark */
	int bodyoff = ftell(tmp);
	/* a character (or BUFFED) to be put at the next turn, -1 if none */
	int pushch = -1;
	CStr(buffed,128);
	CStr(authcand,128);
	/* set to put the next character as is, without interpreting it */
	int asis = 0;
	int ite = 0;
	int ic;
	CStr(buf,256);

	authcand[0] = 0;
	pch = 0;
	asis = 0;
	for( ic = 0; ; ic++ ){
	    if( pushch != -1 ){
		ch = pushch;
		pushch = -1;
		goto INCHAR;
	    }else
	    if( (ch = getc(in)) == EOF ){
		break;
	    }
	    if( asis ){
		asis = 0;
		goto INCHAR;
	    }

	    if( 0 < inScript || 0 < inStyle ){
		/* skip anything but the matching end tag */
		if( ch == '<' ){
			ch = getc(in);
			if( ch == '/' ){
				scanalpha(in,AVStr(tagn),8);
				if( inScript && strcasecmp(tagn,"SCRIPT") == 0
				 || inStyle  && strcasecmp(tagn,"STYLE") == 0
				){
					ch = getc(in);
					if( ch == '>' ){
						if( strcaseeq(tagn,"SCRIPT") )
							inScript--;
						if( strcaseeq(tagn,"STYLE") )
							inStyle--;
					}
				}
			}
		}
            }else
	    if( ch == '<' ){
		ch = getc(in);
		if( ch == EOF )
			break;
		if( ch == '!' ){
			/* a comment or a declaration */
			CStr(comment,64);
			if( skipComment(in,AVStr(comment),sizeof(comment)) == EOF )
				break;
			/* a mark of the beginning of a message body:
			 * restart the output, discarding what has been put
			 */
			if( strcaseeq(comment,"--X-Body-of-Message--")
			 || strcaseeq(comment,"-- body=\"start\" --")
			 || strcaseeq(comment,"--beginarticle--")
			){
				fseek(tmp,bodyoff,0);
			}

			/* the address of the author embedded in a comment */
			if( strncasecmp(comment,"-- email=",9) == 0 ){
				valuescanX(comment+9,AVStr(authcand),sizeof(authcand));
			}
			if( strncasecmp(comment,"--X-From-R13:",13) == 0 ){
				const char *xp;
				lineScan(comment+13,authcand);
				if( xp = strstr(authcand,"--") )
					truncVStr(xp);
				strrot13(authcand);
			}
			continue;
		}
		if( ch == '?' ){
			/* skip to "?>" */
			/* NOTE(review): ch is left as '?' here, so the code below
			 * pushes "<?" back as text -- confirm if it is intended
			 */
			while( !feof(in) ){
				if( getc(in) == '?' )
				if( getc(in) == '>' )
					break;
			}
		}

		if( end = (ch == '/') ){
			ch = getc(in);
		}
		if( !isalpha(ch) ){
			/* not a tag: put the '<' as a text */
			pushch = '<';
			ungetc(ch,in);
			if( end )
				ungetc('/',in);

			if( opt_d )
			fprintf(stderr,"## UNGET TAG <%s%c ##\n",end?"/":"",ch);
			continue;
		}
		ech = fscanTag(in,ch,AVStr(tag),sizeof(tag));
		if( ech != '>' ){
			if( opt_d )
			fprintf(stderr,"## UNGET TAG <%s%s ##\n",end?"/":"",tag);
		}
		/* tagn is the name of the tag, dp points to its attributes */
		dp = wordScan(tag,tagn);

		if( strcasecmp(tagn,"HEAD") == 0 ){
			inHead += end ? 0 : 1;
		}else
		if( strcasecmp(tagn,"SCRIPT") == 0 ){
			inScript += end ? -1 : 1;
		}else
		if( strcasecmp(tagn,"STYLE") == 0 ){
			inStyle += end ? -1 : 1;
		}else
		if( strcasecmp(tagn,"TITLE") == 0 ){
			inTitle = end ? 0 : 1;
		}else
		if( (tagn[0]=='H' || tagn[0]=='h') && isdigit(tagn[1]) ){
			/* separate headings with ',' */
			if( !end && htp != Heads && htp < htx )
				setVStrPtrInc(htp,',');
			inHeading = end ? 0 : atoi(tag+1);
		}else
		if( strcasecmp(tagn,"LINK") == 0 ){
			if( scanLink(dp,AVStr(nam),sizeof(nam),AVStr(con),sizeof(con))==0 )
			if( strcasecmp(nam,"MADE") == 0 ){
				decodeEntitiesX(con,AVStr(Author),sizeof(Author),1);
				/* cut at '?' */
				if( adp = strchr(Author,'?') )
					setVStrPtrInc(adp,0);
				adp = Author + strlen(Author);
			}
		}else
		if( strcasecmp(tagn,"META") == 0 ){
			if( scanMeta(dp,AVStr(nam),sizeof(nam),AVStr(con),sizeof(con)) == 0 ){
			if( strcasecmp(nam,"AUTHOR") == 0 ){
				decodeEntitiesX(con,AVStr(Author),sizeof(Author),1);
				adp = Author + strlen(Author);
			}else
			if( strcasecmp(nam,"COPYRIGHT") == 0 ){
				decodeEntitiesX(con,QVStr(ddp,Address),ddx-ddp,1);
				ddp += strlen(ddp);
			}else
			if( strcasecmp(nam,"SUBJECT") == 0 ){
				decodeEntitiesX(con,AVStr(Title),sizeof(Title),1);
				/* NOTE(review): Title is rewritten from its top but tip is
				 * advanced from its current position -- confirm
				 */
				tip += strlen(tip);
			}else
			if( strcasecmp(nam,"DESCRIPTION") == 0 ){
				decodeEntitiesX(con,AVStr(Descr),sizeof(Descr),1);
			}else
			if( strcasecmp(nam,"KEYWORDS") == 0 ){
				decodeEntitiesX(con,AVStr(Keywd),sizeof(Keywd),1);
			}
			else
			if( strcasecmp(nam,"Content-Location") == 0 ){
				decodeEntitiesX(con,AVStr(Location),sizeof(Location),1);
			}
			else
			if( strcasecmp(nam,"Last-Modified") == 0 ){
				int mtime = scanHTTPtime(con);
				if( mtime == 0 || mtime == -1 ){
					fprintf(stderr,"Illegal Date: %s %s\n",
						con,apath);
					Mtime = 1;
				}else{
					Mtime = mtime;
				}
			}
			}
		}else
		if( strcaseeq(tagn,"A") && scanHREF(dp,AVStr(con),sizeof(con))==0
		 || strcaseeq(tagn,"FRAME") && scanSRC(dp,AVStr(con),sizeof(con))==0
		){
			/* collect the URL if it fits in Links */
			if( strncmp(con,"http://",7) == 0 )
			if( strlen(con) < lkx-lkp-1 ){
				QStrncpy(lkp,con,lkx-lkp);
				lkp += strlen(lkp);
				setVStrPtrInc(lkp,'\n');
			}
			/* the first mailto: is a candidate of the author */
			if( authcand[0] == 0 )
			/* if( inHead || inAddress ) */
			if( strncasecmp(con,"mailto:",7) == 0 ){
				const char *qp;
				if( qp = strchr(con,'?') )
					truncVStr(qp);
				wbstrcpy(AVStr(authcand),authcand+(sizeof(authcand)-1),con,0);
			}
		}else
		if( strcasecmp(tagn,"ADDRESS") == 0 ){
			inAddress = end ? 0 : 1;
		}
	    }else
	    if( ch == '&' ){
		/* a character entity: read it up to its terminator */
		CStr(buf,16);
		int bi = 0;
		setVStrElemInc(buf,bi,ch); /**/
		while( bi < sizeof(buf)-1 ){
			ch = getc(in);
			if( ch == EOF )
				break;
			setVStrElemInc(buf,bi,ch); /**/
			if( ch != '#' && !isalnum(ch) )
				break;
		}
		buf[bi] = 0;
		/* "&nbsp" without ';' is regarded as a space */
		if( bi == 6
		 && ch != ';'
		 && strchr(" &<>",ch)
		 && strncaseeq(buf,"&nbsp",5)
		){
			pushch = BUFFED;
			strcpy(buffed," ");
			ungetc(ch,in); /* maybe typo */
		}else
		if( ch == ';' ){
			if( strcaseeq(buf,"&nbsp;") ){
				pushch = ' ';
			}else{
				CStr(ob,32);
				decodeEntitiesX(buf,AVStr(ob),sizeof(ob),1);
				/* decoded into a character, or left undecoded:
				 * "&#...;" is dropped, the others are put as they are
				 */
				if( ob[0] != '&' || ob[1] == 0 )
					pushch = ob[0];
				else{
					if( ob[1] == '#' ){
					}else{
						pushch = BUFFED;
						FStrncpy(buffed,buf);
					}
				}
			}
		}else{
			/* not an entity: read it again, putting the '&' as is */
			backseek(in,strlen(buf));
			asis = 1;
			if( opt_d )
			fprintf(stderr,"## UNGET ENT [%s%c] ##\n",buf,ch);
		}
	    }else INCHAR:{
		/* a character of text: white spaces except the newline
		 * which follow a white space are dropped
		 */
		if( ch != '\n' && isspace(ch) && preisspace ){
		}else{
			preisspace = isspace(ch);
			if( inTitle ){
				/* the title is not included in the text */
				if( tip < tix )
				if( ch != '\n' && ch != '\r' )
					setVStrPtrInc(tip,ch);
				continue;
			}
			if( inHeading ){
				if( htp < htx )
				if( ch != '\n' && ch != '\r' )
					setVStrPtrInc(htp,ch);
			}
			if( inAddress ){
				if( ddp < ddx ){
					if( ch == '\n' || ch == '\r' )
						ch = ' ';
					if( ch != ' ' || pch != ' ' )
						setVStrPtrInc(ddp,ch);
				}
			}
			/* a newline is dropped while the previous character
			 * was a repetition (ite != 0)
			 */
			if( ch == '\n' && ite ){
			}else{
				if( ch == pch )
					ite++;
				else	ite = 0;
				if( ch == BUFFED ){
					fputs(buffed,tmp);
				}else{
					if(ch==CH_COPYR && !CCXwithJP((CCXP)Ccx)){
						fputs("&copy;",tmp);
					}else{
						putc(ch,tmp);
					}
				}
				pch = ch;
			}
		}
	    }
	}
	setVStrEnd(tip,0);
	setVStrEnd(htp,0);
	setVStrEnd(adp,0);
	setVStrEnd(ddp,0);
	setVStrEnd(lkp,0);

	/* use the candidate if the author is not given explicitly */
	if( authcand[0] ){
		if( Author[0] ){
		}else{
			wbstrcpy(AVStr(Author),Author+(sizeof(Author)-1),authcand,0);
		}
	}
	if( Author[0] ){
		url_unescape(AVStr(Author),AVStr(Author),sizeof(Author),"\"@ <>/?");
	}
}

/* to be moved into library */
/*
 * Scan at most "an" attributes in "src" (the text following the name
 * of a tag) and copy the value of the attribute whose name begins with
 * "nam1" into "val1", and that of "nam2" (if not NULL) into "val2".
 * The names are matched ignoring case.  The scan stops at '>' or at an
 * attribute without '='.
 * Returns the number of the values stored.
 */
int scanAttrs(PCStr(src),int an,PCStr(nam1),PVStr(val1),int vsiz1,PCStr(nam2),PVStr(val2),int vsiz2)
{	const char *nam;
	refQStr(val,val1); /**/
	const char *dp;
	int ai,vsiz,got;

	got = 0;
	dp = src;
	for( ai = 0; ai < an; ai++ ){
		/* bytes are masked to 0..255 because a negative char
		 * must not be passed to isspace()
		 */
		while( isspace(0xFF & *dp) ) dp++;
		if( *dp == '>' )
			break;
		if( strncasecmp(dp,nam1,strlen(nam1)) == 0 ){
			nam = nam1; cpyQStr(val,val1); vsiz = vsiz1;
		}else
		if( nam2 && strncasecmp(dp,nam2,strlen(nam2)) == 0 ){
			nam = nam2; cpyQStr(val,val2); vsiz = vsiz2;
		}else{
			/* skip the name and the value of another attribute */
			int qch;
			for(; *dp && *dp != '='; dp++ );
			if( *dp != '=' )
				break;
			qch = *++dp;
			if( qch == '"' || qch == '\'' ){
				for( dp++; *dp && *dp != qch; dp++);
			}else{
				for(; *dp && !isspace(0xFF & *dp); dp++);
			}
			continue;
		}

		dp += strlen(nam);
		while( isspace(0xFF & *dp) ) dp++;
		if( *dp == '=' ){
			dp++;
			while( isspace(0xFF & *dp) ) dp++;
			dp = valuescanX(dp,ZVStr(val,vsiz),vsiz);
			/* skip the closing quote */
			if( *dp == '"' || *dp == '\'' )
				dp++;
			got++;
		}
	}
	return got;
}
/*
 * Get the value of the HREF attribute in "src" into "href".
 * Returns 0 if it is found, or -1 otherwise ("href" is cleared first).
 */
int scanHREF(PCStr(src),PVStr(href),int size)
{	int found;

	setVStrEnd(href,0);
	found = scanAttrs(src,4, "HREF",AVStr(href),size, NULL,VStrNULL,0);
	return (found == 1) ? 0 : -1;
}
/*
 * Get the value of the SRC attribute in "src" into "href".
 * Returns 0 if it is found, or -1 otherwise ("href" is cleared first).
 */
int scanSRC(PCStr(src),PVStr(href),int size)
{	int found;

	setVStrEnd(href,0);
	found = scanAttrs(src,4, "SRC",AVStr(href),size, NULL,VStrNULL,0);
	return (found == 1) ? 0 : -1;
}
/*
 * Get the attributes of a LINK tag: REL (or REV if REL and HREF are not
 * given together) into "nam" and HREF into "con".
 * Returns 0 if both are got, or -1 otherwise.
 */
int scanLink(PCStr(src),PVStr(nam),int nsiz,PVStr(con),int csiz)
{
	setVStrEnd(con,0);
	setVStrEnd(nam,0);
	if( scanAttrs(src,4, "REL",AVStr(nam),nsiz, "HREF",AVStr(con),csiz) == 2 )
		return 0;
	if( scanAttrs(src,4, "REV",AVStr(nam),nsiz, "HREF",AVStr(con),csiz) == 2 )
		return 0;
	return -1;
}
/*
 * Get the attributes of a META tag: NAME (or HTTP-EQUIV if NAME and
 * CONTENT are not given together) into "nam" and CONTENT into "con".
 * Returns 0 if both are got, or -1 otherwise.
 */
int scanMeta(PCStr(src),PVStr(nam),int nsiz,PVStr(con),int csiz)
{
	setVStrEnd(con,0);
	setVStrEnd(nam,0);
	if( scanAttrs(src,4, "NAME",AVStr(nam),nsiz, "CONTENT",AVStr(con),csiz) == 2
	 || scanAttrs(src,4, "HTTP-EQUIV",AVStr(nam),nsiz, "CONTENT",AVStr(con),csiz) == 2
	){
		return 0;
	}
	return -1;
}
/*
 * Read a tag into "tag" (of "tsiz" bytes), beginning with the character
 * "ch" which has been read already, up to '>' outside of quotes.
 * CR and LF are replaced with a space and successive white spaces are
 * reduced into one.
 * Returns the character at which the scan stopped: '>', EOF, or the one
 * read last when the buffer became full.
 */
int fscanTag(FILE *in,int ch,PVStr(tag),int tsiz)
{	refQStr(tp,tag); /**/
	const char *tx;
	char quote;	/* the quotation mark being open, or 0 */
	int prev = 0;

	quote = 0;
	cpyQStr(tp,tag);
	tx = tag + (tsiz-1);
	for(;;){
		if( tx <= tp || ch == EOF )
			break;

		if( ch == '\'' || ch == '"' ){
			if( quote == 0 )
				quote = ch;
			else
			if( quote == ch )
				quote = 0;
			/* the other mark in a quotation is a usual char. */
		}
		if( ch == '>' && quote == 0 )
			break;

		if( ch == '\n' || ch == '\r' )
			ch = ' ';
		if( !isspace(ch) || !isspace(prev) )
			setVStrPtrInc(tp,ch);
		prev = ch;
		ch = getc(in);
	}
	setVStrEnd(tp,0);
	return ch;
}
/*
 * Skip a comment or a declaration, after its leading "<!" has been read,
 * copying its content into "comment" (up to siz-1 bytes).
 * If the content begins with '-', only "<-" opens and "->" closes a
 * level of nesting; otherwise any character after '<' opens and any '>'
 * closes one.  The scan ends when the first level is closed.
 * Returns the character read last: '>' at the end, or EOF.
 */
int skipComment(FILE *in,PVStr(comment),int siz)
{	int first,cur,prev,depth;
	int loose;
	refQStr(cp,comment); /**/
	const char *xp;

	first = -1;
	prev = -1;
	depth = 1;
	xp = comment + (siz-1);
	cpyQStr(cp,comment);
	for(;;){
		cur = getc(in);
		if( cur == EOF )
			break;
		if( first == -1 )
			first = cur;

		/* not in the "<!-- ... -->" form */
		loose = (first != '-');
		if( prev == '<' && (cur == '-' || loose) ){
			depth++;
		}else
		if( cur == '>' && (prev == '-' || loose) ){
			if( --depth == 0 )
				break;
		}
		prev = cur;
		if( cp < xp ){
			setVStrPtrInc(cp,cur);
		}
	}
	setVStrEnd(cp,0);
	return cur;
}
/*
 * Find the first "p" in "s" and return the pointer to what follows it,
 * or 0 if "p" is not in "s".
 */
const char *strskip(PCStr(s),PCStr(p))
{	const char *hit = strstr(s,p);

	return hit ? hit + strlen(p) : 0;
}

static FILE *Tmps[8];
/*
 * Return the temporary file of the number "ti", ready to be used from
 * its top.  The file is created with tmpfile() at the first use and is
 * reused after that, with its error / EOF indicators cleared.
 * Returns NULL if "ti" is out of the range of Tmps[], or if tmpfile()
 * failed.
 */
FILE *getTmp(int ti)
{
	/* do not access out of Tmps[] */
	if( ti < 0 || (int)(sizeof(Tmps)/sizeof(Tmps[0])) <= ti )
		return NULL;

	if( Tmps[ti] == NULL )
		Tmps[ti] = tmpfile();
	else{
		clearerr(Tmps[ti]);
		fseek(Tmps[ti],0,0);
	}
	return Tmps[ti];
}
/*
 * Finish writing to a temporary file: flush it, cut off what is left
 * beyond the current position (of a previous use), and rewind it to be
 * read.
 */
void fflushTmp(FILE *fp)
{	off_t end;

	fflush(fp);
	end = (off_t)ftell(fp);
	ftruncate(fileno(fp),end);
	fseek(fp,0,0);
}
/*
 * Run the command "filter" with system(), with its standard input and
 * output redirected to "in" and "out".  The standard input and output
 * of this process are restored afterwards.
 * Returns 0 when the command has been run (its exit status is not
 * examined), or -1 if the redirection could not be set up.
 */
int sysfilter(PCStr(filter),FILE *in,FILE *out)
{	int sv0,sv1;
	int rcode = 0;

	/* without the saved descriptors the redirection can not be
	 * undone, so give up before changing anything
	 */
	sv0 = dup(0);
	sv1 = dup(1);
	if( sv0 < 0 || sv1 < 0 ){
		if( 0 <= sv0 ) close(sv0);
		if( 0 <= sv1 ) close(sv1);
		return -1;
	}

	if( dup2(fileno(in),0) < 0 || dup2(fileno(out),1) < 0 ){
		rcode = -1;
	}else{
		system(filter);
		/* NOTE(review): system() waits for the command by itself;
		 * this wait() is kept from the original -- confirm
		 * whether it is necessary
		 */
		wait(0);
	}
	dup2(sv1,1); close(sv1);
	dup2(sv0,0); close(sv0);
	return rcode;
}

/*
 * Get the value of "charset=" in "fval" (the value of a Content-Type)
 * into "charset".  "none" and "guess" are regarded as not specified
 * and "charset" is made empty.  "charset" is left untouched if there
 * is no "charset=" in "fval".  "where" is not used.
 */
void getCharset(PCStr(where),PVStr(charset),PCStr(fval))
{	const char *param;

	param = strcasestr(fval,"charset=");
	if( param == NULL )
		return;

	valuescanX(param+8,AVStr(charset),32);
	if( strcaseeq(charset,"none") || strcaseeq(charset,"guess") ){
		setVStrEnd(charset,0);
	}
}
/*
 * Extract the name of an author from "top", which points to the text
 * after a copyright mark in "str", into "author" (at most 128 bytes
 * including the terminator).
 *  - newlines are unified with adjacent white spaces,
 *  - the text from "All Right" is cut off,
 *  - leading and trailing digits and punctuation (years) are removed,
 *  - the name is cut at a delimiter as ':', ';', '(', '|', ". ", "  ", "- ",
 *  - a leading "by " is removed,
 *  - a name longer than 40 bytes is cut at its 5th space.
 * "str" and "url" are used only in the message shown with "dump" or opt_d.
 * Returns 0 if a name is got, or 1 if nothing is left.
 */
int extractAuthor(PCStr(str),PCStr(top),PVStr(author),int size,PCStr(url),int dump)
{	const char *cp;
	char ch;
	char pch;
	int len;
	refQStr(dp,author); /**/

	/* bytes are masked to 0..255 for <ctype.h>, which is undefined
	 * for a negative char (a byte of a multi-byte character)
	 */
	cp = (char*)top; /* cp for top is "const" but reuse for non-const */
	while( isspace(0xFF & *cp) )
		cp++;

	if( 128 < size )
		size = 128;
	wbstrcpy(AVStr(author),author+(size-1),cp,0);

	/* join lines: a newline next to a white space is removed, and
	 * the others are replaced with a space
	 */
	pch = 0;
	cpyQStr(dp,author);
	for( cp = author; ch = *cp; cp++ ){
		if( ch == '\n' && isspace(0xFF & pch) )
			continue;
		if( isspace(0xFF & ch) && pch == '\n' )
			continue;
		pch = ch;
		if( ch == '\n' )
			ch = ' ';
		setVStrPtrInc(dp,ch);
	}
	setVStrEnd(dp,0);

	if( cp = strcasestr(author,"All Right") )
		truncVStr(cp);

	/* remove leading years and punctuation */
	for( cp = author; *cp; cp++ ){
		if( strchr("0123456789/-.,() ",*cp) == 0 )
			break;
	}
	if( cp != author ){
		/* the source overlaps with the destination, for which
		 * strcpy() is undefined
		 */
		ovstrcpy((char*)author,cp);
	}

	/* cut at the first delimiter */
	for( cp = author; *cp; ){
		if( *cp == ':'
		 || *cp == ';'
		 || *cp == '('
		 || *cp == '|'
		 || *cp == '.' && cp[1] == ' '
		 || *cp == ' ' && cp[1] == ' '
		 || *cp == '-' && cp[1] == ' '
		){
			truncVStr(cp);
			break;
		}
		cp++;
	}

	/* remove trailing years and punctuation, leaving a digit which
	 * follows a letter.  Indexed by the length so that a pointer
	 * before the top of "author" is never made.
	 */
	for( len = strlen(author); 0 < len; len-- ){
		cp = author + (len-1);
		ch = *cp;
		if( strchr(" ,.0123456789-/",ch) == 0 )
			break;
		if( isdigit(0xFF & ch) && author < cp && isalpha(0xFF & cp[-1]) )
			break;
		truncVStr(cp);
	}
	if( strncmp(author,"by ",3) == 0 )
		ovstrcpy((char*)author,author+3);
	if( strcmp(author,"by") == 0 )
		setVStrEnd(author,0);

	if( opt_d || dump )
	fprintf(stderr,"#%d#AUTHOR[%s]%d\n#%d#[%s]%s\n",
		NumPut,author,guessedAuthor, NumPut,str,url);

	if( author[0] != 0 ){
		/* limit the length of a name made of many words */
		int nsp = 0;
		const char *ap;
		for( ap = author; *ap; ap++ ){
			if( *ap == ' ' ){
				++nsp;
				if( 4 < nsp && 40 < (ap-author) ){
					truncVStr(ap);
					break;
				}
			}
		}
		return 0;
	}else{
		return 1;
	}
}
/*
 * Guess the name of an author from a copyright notice in "copyr" and
 * put it into "author" via extractAuthor().
 * The notice is recognized as one of:
 *   "&copy;" (the name follows it, or precedes it as "by xxx &copy;"),
 *   "Copyright" followed by "(C)", the CH_COPYR character, or a year,
 *   "All Rights" followed by "Reserved by".
 * With "force", the whole "copyr" is used when none of these is found.
 *
 * Returns 0 if no notice is found or if a name is extracted, or
 * non-zero if the notice seems to continue (the caller in any2fdif()
 * then keeps the text to be joined with the following line).
 * Note that "copyr" may be truncated at "&copy;" in spite of its type.
 */
int copy2auth(PCStr(copyr),PVStr(author),int size,PCStr(url),int force)
{	const char *cp;
	const char *np;
	const char *acp;

	int ok = 0;
	int dump = 0;

	if( cp = strstr(copyr,"&copy;") ){
		ok = 1;
		acp = strstr(copyr,"by ");
		if( acp && acp < cp ){ /* by xxx &copy; */
			truncVStr(cp);
			cp = acp;
		}else{
			/* skip "&copy;" and an optional "Copyright" after it */
			cp += 6;
			while( isspace(*cp) )
				cp++;
			if( strncaseeq(cp,"Copyright",9) )
				cp += 9;
		}
	}else
	if( cp = strcasestr(copyr,"Copyright") ){
		/* skip "Copyright", "Copyrighted", and a ':' after it */
		cp += 9;
		if( strneq(cp,"ed",2) )
			cp += 2;
		if( *cp == ':' )
			cp++;
		/* look for what confirms the notice: "(C)", the copyright
		 * character, or a year, possibly after another "Copyright"
		 */
		for(;;){
			while( isspace(*cp) )
				cp++;
			if( strncasecmp(cp,"(C)",3) == 0 ){
				cp += 3;
				ok = 1;
				break;
			}
			if( (0xFF & *cp) == CH_COPYR ){ /* &copy; */
				cp += 1;
				ok = 1;
				break;
			}
			if( isdigit(*cp) ){
				int y = atoi(cp);
				/* a year: skip its four digits and go on */
				if( 1900 < y && y < 2100 ){
					cp += 4;
					ok = 1;
					continue;
				}
			}
			if( np = strcasestr(cp,"Copyright") ){
				cp = np + 9;
				continue;
			}
			break;
		}
		if( !ok ){
			if( strcasestr(cp,"All Right") != NULL ){
			/*
			fprintf(stderr,"#### ALLRIGHT %s",copyr);
			*/
			}else
			if( *cp == 0 ){
				/* nothing follows "Copyright" in this text yet */
				return 1;
			}else{
				return 0;
			}
		}
	}else
	if( cp = strcasestr(copyr,"All Rights") ){
		/* cp points to "All Rights" here, thus nothing is skipped */
		while( isspace(*cp) )
			cp++;
		if( cp = strcasestr(cp,"Reserved by") ){
			cp += 11;
		}else{
			return 0;
		}
	}else
	if( force ){
		cp = (char*)copyr;
	}else{
		return 0;
	}
	return extractAuthor(copyr,cp,AVStr(author),size,url,dump);
}
/*
 * Copy "src" into "dst" without writing at or beyond "dx" except for
 * the terminator, and without leaving the first byte of a two-bytes
 * character alone at the end.
 * If "len" is not zero, at most len-1 bytes are copied.
 * A byte with the 8th bit is regarded as the first byte of a pair made
 * with the byte following it.
 * Returns the pointer next to the terminator written, so that the
 * next string can be packed right after it.
 */
char *wbstrcpy(PVStr(dst),PCStr(dx),PCStr(src),int len)
{	refQStr(dp,dst); /**/
	const char *sp;
	char ch;
	int used;	/* bytes consumed, including the terminator */
	int half;	/* the first byte of a pair has just been put */

	cpyQStr(dp,dst);
	half = 0;
	used = 1; /* reserve for terminator */
	sp = src;
	while( len == 0 || used < len ){
		ch = *sp;
		if( ch == 0 )
			break;
		if( dx <= dp )
			break;
		setVStrPtrInc(dp,ch);
		used++;
		if( half )
			half = 0;
		else
		if( ch & 0x80 )
			half = 1;
		sp++;
	}
	if( half ){
		/* remove the first byte left without its second byte */
		dp--;
	}
	setVStrPtrInc(dp,0);
	return (char*)dp;
}

/*
 * Search "file" in the directories listed in "searchpath" by opening
 * it in "mode" with fopen_PATH(), which sets the path found into
 * "fullpath".  "size" is not used.
 * Returns 1 if the file could be opened, or 0 otherwise (including
 * the case of "searchpath" being NULL).
 */
int toFullpathPATH(PCStr(searchpath),PCStr(file),PCStr(mode),PVStr(fullpath),int size)
{	CStr(pathenv,1024);
	const char **dirs;
	FILE *fp;

	if( searchpath == 0 )
		return 0;

	lineScan(searchpath,pathenv);
	dirs = vect_PATH(pathenv);
	fp = fopen_PATH(dirs,file,mode,AVStr(fullpath)/*,size*/);
	if( fp == NULL )
		return 0;

	fclose(fp);
	return 1;
}
/*
 * toFullpathENV -- turn `file` into a path stored in `execpath`.
 *
 * Returns
 *   3  `file` satisfies isFullpath(); it is copied as is
 *   2  File_size(file) is non-negative; execpath is built from the
 *      current directory and `file` by chdir_cwd()
 *   1  found by toFullpathPATH() along the PATH environment variable
 *   0  otherwise (not found, or PATH is not set)
 *
 * `envname` and `mode` are not referenced: the lookup always uses the
 * PATH environment variable and the mode "r".
 */
int toFullpathENV(PCStr(envname),PCStr(file),PCStr(mode),PVStr(execpath),int size)
{	const char *dirs;

	if( isFullpath(file) ){
		QStrncpy(execpath,file,size);
		return 3;
	}
	if( File_size(file) >= 0 ){
		getcwd((char*)execpath,size);
		chdir_cwd(AVStr(execpath),file,1);
		return 2;
	}

	linescanX(file,AVStr(execpath),size);
	dirs = getenv("PATH");
	if( dirs == 0 )
		return 0;
	/* NOTE(review): execpath is both the file name given to the search
	 * and the buffer receiving the result -- confirm that fopen_PATH()
	 * tolerates this aliasing.
	 */
	return toFullpathPATH(dirs,execpath,"r",AVStr(execpath),size);
}
/*
 * xrealpath -- resolve one level of symbolic link.
 *
 *   path   name to be examined with readlink()
 *   rpath  receives the result
 *   size   capacity of rpath
 *
 * When readlink() succeeds, `path` is copied into rpath, cut at its last
 * '/' if any character follows that '/', and the link contents are then
 * applied to it with chdir_cwd().
 *
 * Returns 1 when rpath has been set, 0 when readlink() failed or
 * returned an empty result (rpath is left untouched in that case).
 * Link contents longer than sizeof(link)-1 bytes are truncated.
 */
int xrealpath(PCStr(path),PVStr(rpath),int size)
{	int len;
	const char *dp;
	CStr(link,256);

	/* readlink() does not NUL terminate and may fill the whole buffer,
	 * so one byte is kept back for the terminator stored below.
	 */
	len = readlink(path,link,sizeof(link)-1);
	if( len <= 0 )
		return 0;
	link[len] = 0;

	linescanX(path,AVStr(rpath),size);
	if( dp = strrchr(rpath,'/') ){
		/* drop the last path component, but keep a path ending in '/' */
		if( dp[1] != 0 )
			truncVStr(dp);
	}
	chdir_cwd(AVStr(rpath),link,1);
	return 1;
}

/*
 * isRFC822 -- guess whether the data at the current position of `fp`
 * starts with a RFC822 style header ("Name: value" lines).
 *
 * A leading "From " line (Unix mailbox format) is skipped first.  Then
 * up to 1024 scan steps are made, checking that each line starts with a
 * field name made of alphanumerics and '-' followed by ':'.  A line
 * starting with a space or a tab continues the previous field.  The scan
 * stops at EOF, at an empty line, at the first violation, or after more
 * than five fields have been seen.
 *
 * Returns the scan counter as a positive value when no violation was
 * found and at least two fields were seen, otherwise the negated
 * counter (0 when nothing could be scanned, e.g. fewer than 5 bytes are
 * available).  The stream is repositioned to where it was on entry.
 */
int isRFC822(FILE *fp)
{	int off,non,fnam,nlen,nf,ic,ch,rcc,ufrom;
	char head[5]; /* raw bytes, not NUL terminated */

	nf = 0;   /* number of completed fields */
	non = 0;  /* non-zero: reason why this is not a header */
	fnam = 1; /* scanning a field name (1) or a field body (0) */
	nlen = 1;
	ic = 0;   /* must be defined for the early "goto EXIT" paths */

	off = ftell(fp);
	rcc = fread(head,1,5,fp);
	if( rcc != 5 )
		goto EXIT;
	if( (ufrom = (strncmp(head,"From ",5) == 0)) ){ /* Unix mailbox format */
		for(;;){
			ch = getc(fp);
			if( ch == EOF )
				goto EXIT;
			if( ch == '\n' )
				break;
		}
		ic = ftell(fp) - off;
	}else{
		fseek(fp,off,SEEK_SET);

		if( off != ftell(fp) ){
			/* ftell() returns long; cast to match the %d conversions */
			fprintf(stderr,"isRFC822: SEEK ERROR %d %d\n",(int)ftell(fp),off);
			sleep(1);
		}
		ic = 0;
	}

	for(; ic < 1024; ic++ ){
		ch = getc(fp);
		if( ch == EOF )
			break;
		if( ch == 0 ){
			non = 1; /* NUL byte */
			break;
		}
		if( ch == '\r' ){
			ch = getc(fp);
			if( ch != '\n' ){
				non = 2; /* CR not followed by LF */
				break;
			}
			/* let the LF be handled by the next iteration */
			ungetc(ch,fp);
			continue;
		}
		if( fnam ){
			if( nlen == 0 && ch == '\n' ){
				/* empty line: end of the header */
				break;
			}
			if( isalnum(ch) || (0<nlen && ch=='-') ){
				nlen++;
			}else
			if( ch == ':' ){
				if( nlen == 0 ){
					non = 3; /* empty field name */
					break;
				}
				fnam = 0;
			}else{
				non = 4; /* character not allowed in a field name */
				break;
			}
		}else{
			if( ch == '\n' ){
				/* peek at the first character of the next line */
				ch = getc(fp);
				if( ch == ' ' || ch == '\t' ){
					/* continue */
				}else{
					nf++;
					if( 5 < nf )
						break;
					ungetc(ch,fp);
					fnam = 1;
					nlen = 0;
				}
			}
		}
	}
EXIT:
	if( feof(fp) )
		clearerr(fp);
	fseek(fp,off,SEEK_SET);
	if( !non && 1 < nf )
		return ic;

/*
 fprintf(stderr,"## ufrom=%d non=%d nf=%d ch=%X nlen=%d\n",
 ufrom,non,nf,ch,nlen);
*/
	return -ic;
}

/*
 * backseek -- move the position of `in` back by `disp` bytes relative
 * to the current position.
 *
 * The distance actually moved is checked with ftell(); on a mismatch a
 * message is written to stderr and the caller is delayed by 100ms.
 */
void backseek(FILE *in, int disp){
	int before;
	int moved;

	before = ftell(in);
#ifndef _MSC_VER
	fflush(in); /* to avoid loop on Linux */
#endif
	fseek(in,-disp,1);

	moved = before - ftell(in);
	if( moved == disp )
		return;

	fprintf(stderr,"#### backseek error %d %d\n",
		moved,disp);
	usleep(100000);
}

static double Prev;  /* time of the latest non-forced report */
static double Prev1; /* time of the latest report of any kind */
/*
 * Lap -- write a progress line to stderr.
 *
 *   force   non-zero: report unconditionally; zero: report only when
 *           5 seconds or more have passed since Prev
 *   outlen  when positive, the document/byte statistics are printed
 *   fmt     otherwise, when non-null, printed with the variable
 *           arguments picked up by VARGS()
 *
 * Every line starts with the time since Start and the time since the
 * previous report.  The very first call only initializes Prev/Prev1
 * unless `force` is set.
 */
static void Lap(int force,int outlen,PCStr(fmt),...)
{	double now;
	double total;
	double step;
	VARGS(8,fmt);

	now = Time();
	if( Prev == 0 ){
		Prev = now;
		Prev1 = now;
	}
	if( !force && (now-Prev) < 5 ){
		return;
	}

	total = now - Start;
	step = now - Prev1;
	fprintf(stderr,"+++ %6.2f %4.2f ", total,step);
	if( 0 < outlen ){
		fprintf(stderr,
			"%5d %5d (%5.1f docs/s / %6d bytes/s) %d bytes\n",
			NumAny,NumPut, NumPut/total,
			(int)(outlen/total), outlen);
	}else if( fmt ){
		fprintf(stderr,fmt,VA8);
	}

	/* a forced report does not restart the 5 second interval */
	if( force == 0 ){
		Prev = now;
	}
	Prev1 = now;
}
