/* HtmlTokenizer.java */

/* 
 * Copyright (C) 1997 Mark Boyns <boyns@sdsu.edu>
 *
 * This file is part of Muffin.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
package muffin.html;

import java.io.InputStream;
import java.io.OutputStream;
import java.io.BufferedInputStream;
import java.io.PushbackInputStream;
import java.io.IOException;
import java.util.StringTokenizer;

/**
 * Splits a stream of HTML bytes into tag, comment and text tokens.
 *
 * <p>Each call to {@link #getToken} reads one token from the underlying
 * stream into an internal buffer and returns its type.  The bytes of the
 * most recent token can then be written out ({@link #writeToken},
 * {@link #writeTag}), converted to a string ({@link #getTokenValue}) or
 * turned into a {@link Tag} ({@link #getTag}).  The buffer is overwritten
 * by the next call to {@link #getToken}.
 */
public class HtmlTokenizer extends PushbackInputStream
{
    /** Returned by {@link #getToken} when the stream ended before any byte was read. */
    public static final int TT_EOF = -1;
    /** Returned by {@link #getToken} for a token that starts with '&lt;'. */
    public static final int TT_TAG = -2;
    /** Returned by {@link #getToken} for a token that does not start with '&lt;'. */
    public static final int TT_TEXT = -4;
    /** Returned by {@link #getToken} for a token that starts with "&lt;!--". */
    public static final int TT_COMMENT = -8;

    /** Initial capacity of the token buffer; it doubles whenever it fills up. */
    private static final int INITIAL_SIZE = 512;

    /**
     * Byte sequence which, once seen (not necessarily contiguously) inside a
     * single quoted value, makes the tokenizer give up on that quote and
     * treat it as unterminated.
     */
    private static final int BAD[] = { '>', '<', '>' };

    /** Bytes of the current token; only the first {@code offset} bytes are valid. */
    private byte buf[];
    /** Number of valid bytes in {@code buf}. */
    private int offset = 0;
    /** How many entries of {@code BAD} have been matched in the current quoted value. */
    private int badIndex = 0;

    /**
     * Creates a tokenizer that reads from the given stream.
     *
     * @param in the stream to tokenize
     */
    public HtmlTokenizer (InputStream in)
    {
        super (in);
        buf = new byte[INITIAL_SIZE];
    }

    /**
     * Writes the raw bytes of the current token.
     *
     * @param out destination stream
     * @throws IOException if the write fails
     */
    public void writeToken (OutputStream out) throws IOException
    {
        out.write (buf, 0, offset);
    }

    /**
     * Writes a tag: its string form if it reports itself as modified,
     * otherwise the raw bytes of the current token.
     *
     * @param tag the tag to write
     * @param out destination stream
     * @throws IOException if the write fails
     */
    public void writeTag (Tag tag, OutputStream out) throws IOException
    {
        if (tag.isModified ())
        {
            // Use the encoded length, not String.length(): the two differ
            // as soon as a character encodes to more than one byte.
            byte bytes[] = tag.toString ().getBytes ();
            out.write (bytes, 0, bytes.length);
        }
        else
        {
            out.write (buf, 0, offset);
        }
    }

    /**
     * Returns the current token decoded with the platform default charset.
     *
     * @return the current token as a string
     */
    public String getTokenValue ()
    {
        return new String (buf, 0, offset);
    }

    /**
     * Builds a {@link Tag} from the current token.
     *
     * <p>The name is the lower-cased run of bytes that follows the first
     * byte of the token (after skipping whitespace and '&gt;') up to the
     * next whitespace or '&gt;'.  When whitespace follows the name, the tag's
     * {@code contents}, {@code contentsIndex} and {@code contentsEnd} are
     * set to describe the remainder of the token.  {@code contents} refers
     * to this tokenizer's internal buffer, not to a copy.
     *
     * @return a new tag describing the current token
     */
    public Tag getTag ()
    {
        // Skip the first byte of the token (the '<' of a tag), then any
        // whitespace or '>' before the name.  Never look at or past
        // buf[offset]: those bytes belong to earlier tokens.
        int start = Math.min (1, offset);
        while (start < offset
               && (isWhitespace (buf[start]) || buf[start] == '>'))
        {
            start++;
        }

        // The byte at 'start' always belongs to the name; scan on from the
        // next one for the terminator.
        int end = Math.min (start + 1, offset);
        int rest = 0;
        while (end < offset)
        {
            byte b = buf[end];
            if (isWhitespace (b))
            {
                rest = end + 1;
                break;
            }
            if (b == '>')
            {
                break;
            }
            end++;
        }

        Tag tag = new Tag ();
        // Lower-case with a fixed locale so the result does not depend on
        // the default locale (e.g. 'I' under a Turkish locale).
        tag.name = new String (buf, start, end - start)
            .toLowerCase (java.util.Locale.ENGLISH);
        if (rest > 0)
        {
            tag.contents = buf;
            tag.contentsIndex = rest;
            // Strip the closing '>' only if the token really ends with one;
            // a tag cut short by end of stream does not.
            tag.contentsEnd = buf[offset - 1] == '>' ? offset - 1 : offset;
        }
        return tag;
    }

    /**
     * Reads the next token from the stream into the internal buffer.
     *
     * <p>A tag runs from '&lt;' to the first '&gt;' outside a quoted value,
     * a comment from "&lt;!--" to "--&gt;", and text up to (but not
     * including) the next '&lt;'.  A token may also be ended early by the
     * end of the stream.
     *
     * @return {@link #TT_TAG}, {@link #TT_COMMENT} or {@link #TT_TEXT}, or
     *         {@link #TT_EOF} if the stream ended before any byte was read
     * @throws IOException if reading from the underlying stream fails
     */
    public int getToken () throws IOException
    {
        int type = 0;
        int ch;
        boolean quoted = false;
        int quoteChar = 0;

        offset = 0;
        while ((ch = read ()) != -1)
        {
            if (type != TT_COMMENT)
            {
                if (quoted)
                {
                    /* look for end quote */
                    if (ch == quoteChar)
                    {
                        quoted = false;
                    }
                    else if (ch == BAD[badIndex])
                    {
                        badIndex++;
                        if (badIndex == BAD.length)
                        {
                            // The quote has swallowed what looks like the
                            // end of this tag and a whole following tag:
                            // assume it was never closed.
                            badIndex = 0;
                            quoted = false;
                            System.out.println (getClass ().getName () + " Bad HTML!");
                        }
                    }
                }
                else if (ch == '<')
                {
                    /* look for start tag */
                    if (type != 0)
                    {
                        // A new tag starts here: leave the '<' for the
                        // next call and return what has been collected.
                        unread (ch);
                        return type;
                    }
                    type = TT_TAG;
                }
                else if (type == TT_TAG && (ch == '"' || ch == '\''))
                {
                    /* look for start quote */
                    quoted = true;
                    quoteChar = ch;
                    // Start the bad-HTML heuristic afresh for every quoted
                    // value so that partial matches from earlier quotes or
                    // earlier tokens cannot add up to a false positive.
                    badIndex = 0;
                }
                else if (type == 0)
                {
                    /* otherwise it's text */
                    type = TT_TEXT;
                }
            }

            append (ch);

            /* see if the tag is really a comment */
            if (type == TT_TAG && offset == 4
                && buf[0] == '<'
                && buf[1] == '!'
                && buf[2] == '-'
                && buf[3] == '-')
            {
                type = TT_COMMENT;
            }

            /* look for end tag */
            if (ch == '>' && !quoted)
            {
                if (type == TT_TAG)
                {
                    break;
                }
                // A comment has at least four bytes by the time it is
                // recognised, so offset-3 is always a valid index here.
                if (type == TT_COMMENT
                    && buf[offset - 2] == '-'
                    && buf[offset - 3] == '-')
                {
                    break;
                }
            }
        }

        return type == 0 ? TT_EOF : type;
    }

    /** Appends one byte to the token buffer, doubling its size when full. */
    private void append (int ch)
    {
        if (offset == buf.length)
        {
            byte grown[] = new byte[buf.length * 2];
            System.arraycopy (buf, 0, grown, 0, offset);
            buf = grown;
        }
        buf[offset++] = (byte) ch;
    }

    /** Tells whether the byte is a space, tab, carriage return or newline. */
    private static boolean isWhitespace (byte b)
    {
        return b == ' ' || b == '\t' || b == '\r' || b == '\n';
    }
}
