/*
 * Decompiled with CFR 0.152.
 */
package org.carrot2.clustering.synthetic;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import org.apache.commons.lang3.ArrayUtils;
import org.carrot2.core.Cluster;
import org.carrot2.core.Document;
import org.carrot2.core.IClusteringAlgorithm;
import org.carrot2.core.ProcessingComponentBase;
import org.carrot2.core.ProcessingException;
import org.carrot2.core.attribute.CommonAttributes;
import org.carrot2.core.attribute.Internal;
import org.carrot2.core.attribute.Processing;
import org.carrot2.shaded.guava.common.collect.LinkedHashMultimap;
import org.carrot2.shaded.guava.common.collect.Lists;
import org.carrot2.util.attribute.Attribute;
import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.Input;
import org.carrot2.util.attribute.Label;
import org.carrot2.util.attribute.Output;

/**
 * Groups documents by the host name found in their {@code "url"} field.
 *
 * <p>The host of every document is split on dots and reversed, so that for
 * {@code http://news.example.com/x} the parts are {@code ["com", "example", "news"]}.
 * Documents are then grouped recursively, level by level, on those parts: first by
 * {@code com}, then by {@code example.com}, and so on. A group is only turned into a
 * cluster when it holds at least two documents.
 */
@Bindable(inherit={CommonAttributes.class})
@Label(value="By URL Clustering")
public class ByUrlClusteringAlgorithm
extends ProcessingComponentBase
implements IClusteringAlgorithm {
    /** Host name parts that never form a group of their own (read-only). */
    private static final Set<String> STOP_URL_PARTS = Collections.unmodifiableSet(new HashSet<String>(Collections.singletonList("www")));

    /** Label handed to {@code Cluster.appendOtherTopics} for the remainder group. */
    private static final String OTHER_SITES_LABEL = "Other Sites";

    /** Length of the {@code "://"} separator between the scheme and the host. */
    private static final int SCHEME_SEPARATOR_LENGTH = 3;

    /** Documents to cluster. */
    @Processing
    @Input
    @Internal
    @Attribute(key="documents", inherit=true)
    public List<Document> documents;

    /** Clusters produced by {@link #process()}; {@code null} until it has run. */
    @Processing
    @Output
    @Internal
    @Attribute(key="clusters", inherit=true)
    public List<Cluster> clusters = null;

    /**
     * Clusters {@link #documents} by URL host and stores the result in {@link #clusters}.
     *
     * @throws ProcessingException declared by the component contract; not thrown directly here
     */
    @Override
    public void process() throws ProcessingException {
        Document[] documentArray = this.documents.toArray(new Document[this.documents.size()]);
        String[][] urlParts = this.buildUrlParts(documentArray);
        List<Integer> documentIndexes = new ArrayList<Integer>(documentArray.length);
        for (int i = 0; i < documentArray.length; ++i) {
            documentIndexes.add(i);
        }
        this.clusters = this.createClusters(documentArray, documentIndexes, urlParts, 0, "");
        if (this.clusters.isEmpty()) {
            // No URL-based group was found at all; createClusters() returns early in that
            // case, so the remainder group has to be requested here.
            Cluster.appendOtherTopics(this.documents, this.clusters, OTHER_SITES_LABEL);
        }
    }

    /**
     * Recursively groups the given documents on the URL part at {@code level}.
     *
     * @param documents all documents being clustered
     * @param documentIndexes indexes into {@code documents} to consider at this level
     * @param urlParts reversed host parts per document, as built by {@link #buildUrlParts}
     * @param level index of the host part to group on (0 is the top-level domain)
     * @param labelSuffix label of the enclosing group, or an empty string at the top level
     * @return a mutable list of clusters; empty when no group of two or more documents exists
     */
    private List<Cluster> createClusters(Document[] documents, Collection<Integer> documentIndexes, String[][] urlParts, int level, String labelSuffix) {
        // Insertion-ordered so that groups are visited in document order.
        LinkedHashMultimap<String, Integer> urlPartToDocumentIndex = LinkedHashMultimap.create();
        for (Integer documentIndex : documentIndexes) {
            String[] urlPartsForDocument = urlParts[documentIndex];
            if (urlPartsForDocument == null || urlPartsForDocument.length <= level) {
                continue;
            }
            String urlPart = urlPartsForDocument[level];
            if (STOP_URL_PARTS.contains(urlPart)) {
                continue;
            }
            urlPartToDocumentIndex.put(urlPart, documentIndex);
        }

        List<Cluster> clusters = new ArrayList<Cluster>();
        for (String urlPart : urlPartToDocumentIndex.keySet()) {
            Collection<Integer> indexes = urlPartToDocumentIndex.get(urlPart);
            if (indexes.size() <= 1) {
                // A single document does not make a cluster.
                continue;
            }
            Cluster cluster = new Cluster();
            String clusterLabel = labelSuffix.length() > 0 ? urlPart + "." + labelSuffix : urlPart;
            List<Cluster> subclusters = this.createClusters(documents, indexes, urlParts, level + 1, clusterLabel);
            if (subclusters.size() > 1) {
                cluster.addSubclusters(subclusters);
            } else if (subclusters.size() == 1) {
                // Only one child: collapse it into this cluster and take over its label
                // instead of keeping a chain of single-child clusters.
                Cluster subcluster = subclusters.get(0);
                clusterLabel = subcluster.getPhrases().get(0);
                cluster.addDocuments(subcluster.getDocuments());
                cluster.addSubclusters(subcluster.getSubclusters());
            } else {
                for (Integer documentIndex : indexes) {
                    cluster.addDocuments(documents[documentIndex]);
                }
            }
            cluster.addPhrases(clusterLabel);
            clusters.add(cluster);
        }
        if (clusters.isEmpty()) {
            return Lists.newArrayList();
        }

        Collections.sort(clusters, Cluster.BY_REVERSED_SIZE_AND_LABEL_COMPARATOR);
        List<Document> documentsAtThisLevel = Lists.newArrayListWithExpectedSize(documentIndexes.size());
        for (Integer documentIndex : documentIndexes) {
            documentsAtThisLevel.add(documents[documentIndex]);
        }
        // NOTE(review): presumably adds a remainder cluster for documents not covered by
        // the clusters above; confirm against Cluster.appendOtherTopics.
        Cluster.appendOtherTopics(documentsAtThisLevel, clusters, OTHER_SITES_LABEL);
        return clusters;
    }

    /**
     * Builds the reversed, lower-cased host name parts for every document.
     *
     * @param documents documents whose {@code "url"} field is read
     * @return an array parallel to {@code documents}; an entry is {@code null} when the
     *         document has no URL or the URL has no host
     */
    final String[][] buildUrlParts(Document[] documents) {
        String[][] urlParts = new String[documents.length][];
        for (int i = 0; i < documents.length; ++i) {
            String url = (String)documents[i].getField("url");
            if (url != null) {
                urlParts[i] = reversedHostParts(url);
            }
        }
        return urlParts;
    }

    /**
     * Extracts the host from a URL, lower-cases it, splits it on dots and reverses the parts.
     *
     * <p>The host is everything between an optional {@code "://"} and the next {@code '/'}
     * (or the end of the string). Port numbers, user info and query strings are not
     * stripped.
     *
     * @param url URL to parse, never {@code null}
     * @return the reversed host parts, or {@code null} when the URL has no host
     */
    private static String[] reversedHostParts(String url) {
        int hostStart = 0;
        int schemeSeparatorIndex = url.indexOf("://");
        if (schemeSeparatorIndex >= 0) {
            hostStart = schemeSeparatorIndex + SCHEME_SEPARATOR_LENGTH;
            if (hostStart >= url.length()) {
                return null;
            }
        }
        // Search from the start of the host. Starting any later would miss the slash that
        // ends a host shorter than the skipped distance and glue the path onto the host.
        int hostEnd = url.indexOf('/', hostStart);
        if (hostEnd < 0) {
            hostEnd = url.length();
        }
        // Locale.ROOT keeps the result independent of the JVM default locale.
        String host = url.substring(hostStart, hostEnd).toLowerCase(Locale.ROOT);
        if (host.isEmpty()) {
            // An empty host would otherwise become a group labelled "".
            return null;
        }
        String[] hostParts = host.split("\\.");
        ArrayUtils.reverse(hostParts);
        return hostParts;
    }
}

