/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.parsefilter.naivebayes;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.StringUtils;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parsefilter.naivebayes.Classify;
import org.apache.nutch.parsefilter.naivebayes.Train;
import org.apache.nutch.protocol.Content;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.DocumentFragment;

/**
 * HTML parse filter that classifies the parsed text of a page with
 * {@link Classify} and, when the page is classified as irrelevant, keeps only
 * those outlinks whose target URL contains at least one word from the
 * configured wordlist. Outlinks of pages classified as relevant are left
 * untouched.
 *
 * <p>Configuration is read in {@link #setConf(Configuration)} from the
 * properties {@value #TRAINFILE_MODELFILTER} (training file) and
 * {@value #DICTFILE_MODELFILTER} (wordlist resource).
 */
public class NaiveBayesParseFilter
implements HtmlParseFilter {
    private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
    /** Configuration property naming the training file handed to {@link Train#start}. */
    public static final String TRAINFILE_MODELFILTER = "parsefilter.naivebayes.trainfile";
    /** Configuration property naming the wordlist resource used to filter outlink URLs. */
    public static final String DICTFILE_MODELFILTER = "parsefilter.naivebayes.wordlist";
    /** Path whose existence on the configured file system causes training to be skipped. */
    private static final String MODEL_PATH = "naivebayes-model";
    private Configuration conf;
    private String inputFilePath;
    private String dictionaryFile;
    /** Words loaded from the wordlist resource; one entry per non-blank line. */
    private final ArrayList<String> wordlist = new ArrayList<>();

    /**
     * Classifies the given page text, treating a classification failure as
     * "irrelevant".
     *
     * @param text parsed text of the page
     * @return {@code true} if {@link #classify(String)} reports the text as
     *         relevant; {@code false} if it is irrelevant or an
     *         {@link IOException} occurred (the error is logged)
     */
    public boolean filterParse(String text) {
        try {
            return this.classify(text);
        }
        catch (IOException e) {
            LOG.error("Error occured while classifying:: {} ::", text, e);
            return false;
        }
    }

    /**
     * Checks a URL against the wordlist loaded in {@link #setConf(Configuration)}.
     *
     * @param url URL to test
     * @return {@code true} if the URL contains at least one wordlist entry
     */
    public boolean filterUrl(String url) {
        return this.containsWord(url, this.wordlist);
    }

    /**
     * Delegates to {@link Classify#classify(String)} and interprets its result.
     *
     * @param text text to classify
     * @return {@code true} if and only if the classifier returned the label {@code "1"}
     * @throws IOException if the classifier fails
     */
    public boolean classify(String text) throws IOException {
        // Constant-first comparison: a null label is treated as "not relevant"
        // instead of raising a NullPointerException.
        return "1".equals(Classify.classify(text));
    }

    /**
     * Trains the model with {@link Train#start(String)} on the configured
     * training file, unless a path named {@code naivebayes-model} already
     * exists on the configured file system.
     *
     * @throws Exception if the file system lookup or the training fails
     */
    public void train() throws Exception {
        if (FileSystem.get(this.conf).exists(new Path(MODEL_PATH))) {
            LOG.info("Model file already exists. Skipping training.");
            return;
        }
        LOG.info("Training the Naive Bayes Model");
        Train.start(this.inputFilePath);
    }

    /**
     * Tests whether a URL contains any of the given words as a substring.
     *
     * @param url      URL to inspect; {@code null} never matches
     * @param wordlist words to look for; {@code null} never matches
     * @return {@code true} if at least one word occurs in {@code url}
     */
    public boolean containsWord(String url, ArrayList<String> wordlist) {
        if (url == null || wordlist == null) {
            return false;
        }
        for (String word : wordlist) {
            if (url.contains(word)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Stores the configuration, loads the wordlist and triggers training.
     *
     * <p>A failure during training is logged and does not propagate.
     *
     * @param conf configuration providing {@value #TRAINFILE_MODELFILTER} and
     *             {@value #DICTFILE_MODELFILTER}
     * @throws IllegalArgumentException if either property is unset or blank,
     *                                  or if the wordlist resource cannot be found
     */
    public void setConf(Configuration conf) {
        this.conf = conf;
        this.inputFilePath = conf.get(TRAINFILE_MODELFILTER);
        this.dictionaryFile = conf.get(DICTFILE_MODELFILTER);
        if (isBlank(this.inputFilePath) || isBlank(this.dictionaryFile)) {
            String message = "ParseFilter: NaiveBayes: trainfile or wordlist not set in the "
                + TRAINFILE_MODELFILTER + " or " + DICTFILE_MODELFILTER;
            LOG.error(message);
            throw new IllegalArgumentException(message);
        }
        this.loadWordlist(conf);
        try {
            this.train();
        }
        catch (Exception e) {
            LOG.error("Error occured while training::", e);
        }
    }

    public Configuration getConf() {
        return this.conf;
    }

    /**
     * Filters the outlinks of the parse belonging to {@code content}.
     *
     * <p>If the page text is classified as irrelevant, the parse's outlinks
     * are replaced by the subset accepted by {@link #filterUrl(String)};
     * otherwise the parse is left unchanged.
     *
     * @param content     fetched content; its URL selects the parse to inspect
     * @param parseResult parse result to filter
     * @param metaTags    not used by this filter
     * @param doc         not used by this filter
     * @return the same {@code parseResult} instance, possibly with reduced outlinks
     */
    public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
        Parse parse = parseResult.get(content.getUrl());
        if (parse == null) {
            // Nothing was parsed under this URL, so there is nothing to filter.
            return parseResult;
        }
        String url = content.getBaseUrl();
        if (this.filterParse(parse.getText())) {
            LOG.info("ParseFilter: NaiveBayes: Page found relevant:: {}", url);
            return parseResult;
        }
        LOG.info("ParseFilter: NaiveBayes: Page found irrelevant:: {}", url);
        LOG.info("Checking outlinks");
        ArrayList<Outlink> relevantOutlinks = new ArrayList<Outlink>();
        // Fetch the outlink array once instead of on every loop iteration.
        for (Outlink outlink : parse.getData().getOutlinks()) {
            String target = outlink.getToUrl();
            LOG.info("ParseFilter: NaiveBayes: Outlink to check:: {}", target);
            if (this.filterUrl(target)) {
                relevantOutlinks.add(outlink);
                LOG.info("ParseFilter: NaiveBayes: found relevant");
            } else {
                LOG.info("ParseFilter: NaiveBayes: found irrelevant");
            }
        }
        parse.getData().setOutlinks(relevantOutlinks.toArray(new Outlink[relevantOutlinks.size()]));
        return parseResult;
    }

    /**
     * Replaces the in-memory wordlist with the non-blank lines of the
     * configured wordlist resource.
     *
     * <p>An {@link IOException} while reading is logged; lines read before the
     * failure are kept.
     *
     * @throws IllegalArgumentException if the wordlist resource cannot be found
     */
    private void loadWordlist(Configuration conf) {
        // Start from a clean list so repeated setConf calls do not pile up duplicates.
        this.wordlist.clear();
        // The wordlist is read as a configuration resource, so its presence is
        // checked through the same lookup rather than through the file system.
        // NOTE(review): the training file is not pre-validated here because how
        // Train locates it is not visible in this class; a missing training
        // file surfaces through train().
        Reader reader = conf.getConfResourceAsReader(this.dictionaryFile);
        if (reader == null) {
            String message = "ParseFilter: NaiveBayes: wordlist " + this.dictionaryFile + " not found!";
            LOG.error(message);
            throw new IllegalArgumentException(message);
        }
        try (BufferedReader br = new BufferedReader(reader)) {
            String line;
            while ((line = br.readLine()) != null) {
                // A blank entry would match every URL (every string contains ""),
                // which would defeat the outlink filtering entirely.
                if (line.trim().isEmpty()) {
                    continue;
                }
                this.wordlist.add(line);
            }
        }
        catch (IOException e) {
            LOG.error("ParseFilter: NaiveBayes: failed to read wordlist {}", this.dictionaryFile, e);
        }
    }

    /** Returns {@code true} if the value is {@code null} or contains only whitespace. */
    private static boolean isBlank(String value) {
        return value == null || value.trim().isEmpty();
    }
}

