1    /*
2     * Copyright 2008 :torweg free software group
3     * 
4     * This program is free software: you can redistribute it and/or modify
5     * it under the terms of the GNU General Public License as published by
6     * the Free Software Foundation, either version 3 of the License, or
7     * (at your option) any later version.
8     * 
9     * This program is distributed in the hope that it will be useful,
10    * but WITHOUT ANY WARRANTY; without even the implied warranty of
11    * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12    * GNU General Public License for more details.
13    * 
14    * You should have received a copy of the GNU General Public License
15    * along with this program.  If not, see <http://www.gnu.org/licenses/>.
16    *
17    */
18   package org.torweg.pulse.util.search;
19   
20   import java.io.IOException;
21   import java.util.BitSet;
22   
23   import org.apache.lucene.index.IndexReader;
24   import org.apache.lucene.index.Term;
25   import org.apache.lucene.index.TermDocs;
26   import org.apache.lucene.index.TermEnum;
27   import org.apache.lucene.search.CachingWrapperFilter;
28   import org.apache.lucene.search.DocIdSet;
29   import org.apache.lucene.search.Filter;
30   import org.apache.lucene.util.DocIdBitSet;
31   import org.hibernate.search.annotations.Factory;
32   
33   /**
34    * a {@code Filter} providing unique {@code Content}s during full text
35    * search.
36    * <p>
37    * The actual filtering is heavily influenced by
38    * {@code org.apache.lucene.search.DuplicateFilter} of the
39    * Lucene<sup>TM</sup> contrib package.
40    * </p>
41    * 
42    * @see org.torweg.pulse.site.map.SitemapNode
43    * @author unknown, Thomas Weber
44    * @version $Revision: 1415 $
45    */
46   public class SitemapNodeUniqueContentFilter extends Filter {
47   
48       /**
49        * serialVersionUID.
50        */
51       private static final long serialVersionUID = 6375875782630382349L;
52   
53       /**
54        * default constructor.
55        */
56       public SitemapNodeUniqueContentFilter() {
57           super();
58       }
59   
60       /**
61        * factory method to create a cachable version of
62        * {@code SitemapNodeUniqueContentFilter}.
63        * 
64        * @see Factory
65        * @return the cachable filter
66        */
67       @Factory
68       public final Filter getFilter() {
69           Filter filter = new SitemapNodeUniqueContentFilter();
70           return new CachingWrapperFilter(filter);
71       }
72   
73       /**
74        * does the actual filtering.
75        * <p>
76        * It creates a {@code DocIdBitSet} which is
77        * {@link IndexReader#maxDoc()} wide with bits set for all non duplicates.
78        * </p>
79        * 
80        * @param reader
81        *            the index reader
82        * @return a bit set with the results to be included
83        * @throws IOException
84        *             on errors accessing the index
85        * @see org.apache.lucene.search.Filter#getDocIdSet(org.apache.lucene.index.IndexReader)
86        */
87       @Override
88       public final DocIdSet getDocIdSet(final IndexReader reader)
89               throws IOException {
90           BitSet bits = new BitSet(reader.maxDoc()); // assume all are invalid
91           Term uniqueTerm = new Term("contentId", "");
92           TermEnum termEnumeration = reader.terms(uniqueTerm);
93           if (termEnumeration != null) {
94               Term currentTerm = termEnumeration.term();
95               while ((currentTerm != null)
96                       && (currentTerm.field().equals(uniqueTerm.field()))) {
97                   /* mark non duplicates */
98                   TermDocs td = reader.termDocs(currentTerm);
99                   if (td.next()) {
100                      bits.set(td.doc());
101                  }
102                  if (!termEnumeration.next()) {
103                      break;
104                  }
105                  currentTerm = termEnumeration.term();
106              }
107          }
108          return new DocIdBitSet(bits);
109      }
110  }
111