/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.crawl;

import java.io.IOException;
import java.lang.invoke.MethodHandles;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class CrawlDbFilter
extends Mapper<Text, CrawlDatum, Text, CrawlDatum> {
    public static final String URL_FILTERING = "crawldb.url.filters";
    public static final String URL_NORMALIZING = "crawldb.url.normalizers";
    public static final String URL_NORMALIZING_SCOPE = "crawldb.url.normalizers.scope";
    private boolean urlFiltering;
    private boolean urlNormalizers;
    private boolean url404Purging;
    private boolean purgeOrphans;
    private URLFilters filters;
    private URLNormalizers normalizers;
    private String scope;
    private Counter goneRecordsRemovedCounter;
    private Counter orphanRecordsRemovedCounter;
    private Counter urlsFilteredCounter;
    private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
    private Text newKey = new Text();

    public void setup(Mapper.Context context) {
        Configuration conf = context.getConfiguration();
        this.urlFiltering = conf.getBoolean(URL_FILTERING, false);
        this.urlNormalizers = conf.getBoolean(URL_NORMALIZING, false);
        this.url404Purging = conf.getBoolean("db.update.purge.404", false);
        this.purgeOrphans = conf.getBoolean("db.update.purge.orphans", false);
        if (this.urlFiltering) {
            this.filters = new URLFilters(conf);
        }
        if (this.urlNormalizers) {
            this.scope = conf.get(URL_NORMALIZING_SCOPE, "crawldb");
            this.normalizers = new URLNormalizers(conf, this.scope);
        }
        this.initCounters(context);
    }

    private void initCounters(Mapper.Context context) {
        this.goneRecordsRemovedCounter = context.getCounter("nutch_crawldb_filter", "gone_records_removed_total");
        this.orphanRecordsRemovedCounter = context.getCounter("nutch_crawldb_filter", "orphan_records_removed_total");
        this.urlsFilteredCounter = context.getCounter("nutch_crawldb_filter", "urls_filtered_total");
    }

    public void map(Text key, CrawlDatum value, Mapper.Context context) throws IOException, InterruptedException {
        String url = key.toString();
        if (this.url404Purging && 3 == value.getStatus()) {
            this.goneRecordsRemovedCounter.increment(1L);
            return;
        }
        if (this.purgeOrphans && 8 == value.getStatus()) {
            this.orphanRecordsRemovedCounter.increment(1L);
            return;
        }
        if (url != null && this.urlNormalizers) {
            try {
                url = this.normalizers.normalize(url, this.scope);
            }
            catch (Exception e) {
                LOG.warn("Skipping {}: ", (Object)url, (Object)e);
                url = null;
            }
        }
        if (url != null && this.urlFiltering) {
            try {
                url = this.filters.filter(url);
            }
            catch (Exception e) {
                LOG.warn("Skipping {}: ", (Object)url, (Object)e);
                url = null;
            }
        }
        if (url == null) {
            this.urlsFilteredCounter.increment(1L);
        } else {
            this.newKey.set(url);
            context.write((Object)this.newKey, (Object)value);
        }
    }
}

