PorterStemmer source code

1 
2 /**************************************************************************
3  * FPorterStemmer.d - 29-Mai-2007
4  * - Filter for perferming word stemming based on the Porter algorithm
5  *
6  * Copyright (c) 2007 Daniel Truemper (daniel-truemper at gmx.de)
7  *
8  **************************************************************************/
9 
10 /**
11  * Note:
12  *
13  * This implementation was converted from the python implementation
14  * made by Vivake Gupta (v@nano.com) so all credit belongs to him!
15  *
16  * ---- original comment
17  *
18  * This is the Porter stemming algorithm, ported to Python from the
19  * version coded up in ANSI C by the author. It may be be regarded
20  * as canonical, in that it follows the algorithm presented in
21  *
22  * Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
23  * no. 3, pp 130-137,
24  *
25  * only differing from it at the points maked --DEPARTURE-- below.
26  *
27  * See also http://www.tartarus.org/~martin/PorterStemmer
28  *
29  * The algorithm as described in the paper could be exactly replicated
30  * by adjusting the points of DEPARTURE, but this is barely necessary,
31  * because (a) the points of DEPARTURE are definitely improvements, and
32  * (b) no encoding of the Porter stemmer I have seen is anything like
33  * as exact as this version, even with the points of DEPARTURE!
34  *
35  * Vivake Gupta (v@nano.com)
36  *
37  * Release 1: January 2001
38  *
39  * ---- end of original comment
40  *
41  */
42 module PorterStemmer;
43 
44 public struct PorterStemmer
45 {
46 	private
47 	{
48 		const(char)[] m_b;			// buffer for the word
49 		int m_k = 0;
50 		int m_k0 = 0;
51 		int m_j = 0;		// offset within the string
52 	}
53 
54 	/**
55 	 * cons returns true, if b[i] is a consonant
56 	 */
57 	private bool cons( int i )
58 	{
59 		if( m_b[i] == 'a' || m_b[i] == 'e' || m_b[i] == 'i' || m_b[i] == 'o' || m_b[i] == 'u' )
60 			return false;
61 
62 		if( m_b[i] == 'y' )
63 		{
64 			if( i == m_k0 )
65 			{
66 				return true;
67 			} else
68 			{
69 				return !cons( i - 1 );
70 			}
71 		}
72 		return true;
73 	}
74 
75 	/**
76 	 * measures the number of consonant sequences between k0 and j.
77 	 * if c is a consonant sequence and v a vowel sequence, and <..>
78 	 * indicates arbitrary presence,
79 	 *
80 	 * <c><v>       gives 0
81 	 * <c>vc<v>     gives 1
82 	 * <c>vcvc<v>   gives 2
83 	 * <c>vcvcvc<v> gives 3
84 	 *
85 	 */
86 	private int m()
87 	{
88 		int n = 0;
89 		int i = m_k0;
90 		
91 		while( true )
92 		{
93 			if( i > m_j )
94 			{
95 				return n;
96 			}
97 			if( !cons( i ) )
98 			{
99 				break;
100 			}
101 			i++;
102 		}
103 		i++;
104 		while( true )
105 		{
106 			while( true )
107 			{
108 				if( i > m_j )
109 				{
110 					return n;
111 				}
112 				if( cons( i ) )
113 				{
114 					break;
115 				}
116 				i++;
117 			}
118 			i++;
119 			n++;
120 			while( true )
121 			{
122 				if( i > m_j )
123 				{
124 					return n;
125 				}
126 				if( !cons( i ) )
127 				{
128 					break;
129 				}
130 				i++;
131 			}
132 			i++;
133 		}
134 	}
135 
136 	/**
137 	 * returns true if k0...j contains a vowel
138 	 */
139 	private bool vowelinstem()
140 	{
141 		for( int i = m_k0; i < m_j + 1; i++ )
142 		{
143 			if( !cons( i ) )
144 				return true;
145 		}
146 		return false;
147 	}
148 
149 	/**
150 	 * returns true if j, j-1 contains a double consonant
151 	 */
152 	private bool doublec( int j )
153 	{
154 		if( j < ( m_k0 + 1 ) )
155 			return false;
156 		if( m_b[j] != m_b[j-1] )
157 			return false;
158 		return cons( j );
159 	}
160 
161 	/**
162 	 * is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant
163 	 * and also if the second c is not w,x or y. this is used when trying to
164 	 * restore an e at the end of a short  e.g.
165 	 *
166 	 *    cav(e), lov(e), hop(e), crim(e), but
167 	 *    snow, box, tray.
168 	 *
169 	 */
170 	private bool cvc( int i )
171 	{
172 		if( i < ( m_k0 + 2 ) || !cons( i ) || cons( i-1 ) || !cons( i-2 ) )
173 			return false;
174 		if( m_b[i] == 'w' || m_b[i] == 'x' || m_b[i] == 'y' )
175 			return false;
176 		return true;
177 	}
178 
179 	/**
180 	 * ends(s) is TRUE <=> k0,...k ends with the string s.
181 	 */
182 	private bool ends( const char[] s )
183 	{
184 		int len = cast(int) s.length;
185 
186 		if( s[ len - 1 ] != m_b[ m_k ] )
187 			return false;
188 		if( len > ( m_k - m_k0 + 1 ) )
189 			return false;
190 
191 		int a = m_k - len + 1;
192 		int b = m_k + 1;
193 
194 		if( m_b[a..b] != s )
195 		{
196 			return false;
197 		}
198 		m_j = m_k - len;
199 
200 		return true;
201 	}
202 
203 	/**
204 	 * setto(s) sets (j+1),...k to the characters in the string s, readjusting k.
205 	 */
206 	private void setto( const char[] s )
207 	{
208 		m_b = m_b[0..m_j+1] ~ s ~ m_b[ m_j + s.length + 1 .. m_b.length ];
209 		m_k = m_j + cast(int) s.length;
210 	}
211 
212 	/**
213 	 * used further down
214 	 */
215 	private void r( const char[] s )
216 	{
217 		if( m() > 0 )
218 			setto( s );
219 	}
220 
221 	/**
222 	 * step1ab() gets rid of plurals and -ed or -ing. e.g.
223 	 *
224 	 *     caresses  ->  caress
225 	 *     ponies    ->  poni
226 	 *     ties      ->  ti
227 	 *     caress    ->  caress
228 	 *     cats      ->  cat
229 	 *
230 	 *     feed      ->  feed
231 	 *     agreed    ->  agree
232 	 *     disabled  ->  disable
233 	 *
234 	 *     matting   ->  mat
235 	 *     mating    ->  mate
236 	 *     meeting   ->  meet
237 	 *     milling   ->  mill
238 	 *     messing   ->  mess
239 	 *
240 	 *     meetings  ->  meet
241 	 *
242 	 */
243 	private void step1ab()
244 	{
245 		if( m_b[ m_k ] == 's' )
246 		{
247 			if( ends( "sses" ) )
248 				m_k = m_k - 2;
249 			else if( ends( "ies" ) )
250 				setto( "i" );
251 			else if( m_b[ m_k - 1 ] != 's' )
252 				m_k--;
253 		}
254 		if( ends( "eed" ) )
255 		{
256 			if( m() > 0 )
257 				m_k--;
258 		} else if( ( ends( "ed" ) || ends( "ing" ) ) && vowelinstem() )
259 		{
260 			m_k = m_j;
261 			if( ends( "at" ) )
262 			{
263 				setto( "ate" );
264 			} else if( ends( "bl" ) )
265 			{
266 				setto( "ble" );
267 			} else if( ends( "iz" ) )
268 			{
269 				setto( "ize" );
270 			} else if( doublec( m_k ) )
271 			{
272 				m_k--;
273 				if( m_b[ m_k ] == 'l' || m_b[ m_k ] == 's' || m_b[ m_k ] == 'z' )
274 					m_k++;
275 			} else if( m() == 1 && cvc( m_k ) )
276 			{
277 				setto( "e" );
278 			}
279 		}
280 	}
281 
282 	/**
283 	 * step1c() turns terminal y to i when there is another vowel in the stem.
284 	 */
285 	private void step1c()
286 	{
287 		if( ends( "y" ) && vowelinstem() )
288 		{
289 			m_b = m_b[0..m_k] ~ 'i' ~ m_b[ m_k+1 .. m_b.length ];
290 		}
291 	}
292 
293 	/**
294 	 * step2() maps double suffices to single ones.
295      * so -ization ( = -ize plus -ation) maps to -ize etc. note that the
296      * string before the suffix must give m() > 0.*
297 	 */
298 	private void step2()
299 	{
300 		if( m_b[ m_k - 1 ] == 'a' )
301 		{
302 			if( ends( "ational" ) )
303 				r( "ate" );
304 			else if( ends( "tional" ) )
305 				r( "tion" );
306 		} else if( m_b[ m_k - 1 ] == 'c' )
307 		{
308 			if( ends( "enci" ) )
309 				r( "ence" );
310 			else if( ends( "anci" ) )
311 				r( "ance" );
312 		} else if( m_b[ m_k - 1 ] == 'e' )
313 		{
314 			if( ends( "izer" ) )
315 				r( "ize" );
316 		} else if( m_b[ m_k - 1 ] == 'l' )
317 		{
318 			if( ends( "bli" ) )
319 				r( "ble" );
320 			/* --DEPARTURE--
321 			 * To match the published algorithm, replace this phrase with
322 			 * if( ends( "abli" ) )
323 			 *	   r( "able" );
324 			 */
325 			else if( ends( "alli" ) )
326 				r( "al" );
327 			else if( ends( "entli" ) )
328 				r( "ent" );
329 			else if( ends( "eli" ) )
330 				r( "e" );
331 			else if( ends( "ousli" ) )
332 				r( "ous" );
333 		} else if( m_b[ m_k - 1 ] == 'o' )
334 		{
335 			if( ends( "ization" ) )
336 				r( "ize" );
337 			else if( ends( "ation" ) || ends( "ator" ) )
338 				r( "ate" );
339 		} else if( m_b[ m_k - 1 ] == 's' )
340 		{
341 			if( ends( "alism" ) )
342 				r( "al" );
343 			else if( ends( "iveness" ) )
344 				r( "ive" );
345 			else if( ends( "fulness" ) )
346 				r( "ful" );
347 			else if( ends( "ousness" ) )
348 				r( "ous" );
349 		} else if( m_b[ m_k - 1 ] == 't' )
350 		{
351 			if( ends( "aliti" ) )
352 				r( "al" );
353 			else if( ends( "iviti" ) )
354 				r( "ive" );
355 			else if( ends( "biliti" ) )
356 				r( "ble" );
357 		} else if( m_b[ m_k - 1 ] == 'g' )
358 		{
359 			/**
360 			 * --DEPARTURE--
361 			 * To match the published algorithm, delete this phrase
362 			 */
363 			if( ends( "logi" ) )
364 				r( "log" );
365 		}
366 
367 	}
368 
369 	/**
370 	 * step3() dels with -ic-, -full, -ness etc. similar strategy to step2.
371 	 */
372 	private void step3()
373 	{
374 		if( m_b[ m_k ] == 'e' )
375 		{
376 			if( ends( "icate" ) )
377 				r( "ic" );
378 			else if( ends( "ative" ) )
379 				r( "" );
380 			else if( ends( "alize" ) )
381 				r( "al" );
382 		}else if( m_b[ m_k ] == 'i' )
383 		{
384 			if( ends( "iciti" ) )
385 				r( "ic" );
386 		}else if( m_b[ m_k ] == 'l' )
387 		{
388 			if( ends( "ical" ) )
389 				r( "ic" );
390 			else if( ends( "ful" ) )
391 				r( "" );
392 		}else if( m_b[ m_k ] == 's' )
393 		{
394 			if( ends( "ness" ) )
395 				r( "" );
396 		}
397 	}
398 
399 	/**
400 	 * step4() takes off -ant, -ence etc., in context <c>vcvc<v>.
401 	 */
402 	private void step4()
403 	{
404 
405 		/* fixes bug 1 */
406 		if( m_k == 0 )
407 			return;
408 		switch( m_b[ m_k - 1 ] )
409 		{
410 
411 			case 'a':
412 				if( ends( "al" ) )
413 					break;
414 				return;
415 
416 			case 'c':
417 				if( ends( "ance" ) || ends( "ence" ) )
418 					break;
419 				return;
420 
421 			case 'e':
422 				if( ends( "er" ) )
423 					break;
424 				return;
425 
426 			case 'i':
427 				if( ends( "ic" ) )
428 					break;
429 				return;
430 				
431 			case 'l':
432 				if( ends( "able" ) || ends( "ible" ) )
433 					break;
434 				return;
435 
436 			case 'n':
437 				if( ends( "ant" ) || ends( "ement" ) || ends( "ment" ) || ends( "ent" ) )
438 					break;
439 				return;
440 
441 			case 'o':
442 				if( ends( "ion" ) && m_j >= 0 && ( m_b[ m_j ] == 's' || m_b[ m_j ] == 't' ) )
443 				{
444 								  /* m_j >= 0 fixes bug 2 */
445 					break;
446 				}
447 				if( ends( "ou" ) )
448 					break;
449 				return;
450 
451 			case 's':
452 				if( ends( "ism" ) )
453 					break;
454 				return;
455 
456 			case 't':
457 				if( ends( "ate" ) || ends( "iti" ) )
458 					break;
459 				return;
460 
461 			case 'u':
462 				if( ends( "ous" ) )
463 					break;
464 				return;
465 			
466 			case 'v':
467 				if( ends( "ive" ) )
468 					break;
469 				return;
470 
471 			case 'z':
472 				if( ends( "ize" ) )
473 					break;
474 				return;
475 
476 			default:
477 				return;
478 
479 		}
480 
481 		if( m() > 1 )
482 			m_k = m_j;
483 
484 	}
485 
486 	/**
487 	 * step5() removes a final -e if m() > 1, and changes -ll to -l if m() > 1.
488 	 */
489 	private void step5()
490 	{
491 		m_j = m_k;
492 		if( m_b[ m_k ] == 'e' )
493 		{
494 			auto a = m();
495 			if( a > 1 || (a == 1 && !cvc( m_k - 1 ) ) )
496 				m_k--;
497 		}
498 		if( m_b[ m_k ] == 'l' && doublec( m_k ) && m() > 1 )
499 			m_k--;
500 		
501 	}
502 
503 	/**
504 	 * In stem(p,i,j), p is a char pointer, and the string to be stemmed
505      * is from p[i] to p[j] inclusive. Typically i is zero and j is the
506      * offset to the last character of a string, (p[j+1] == '\0'). The
507      * stemmer adjusts the characters p[i] ... p[j] and returns the new
508      * end-point of the string, k. Stemming never increases word length, so
509      * i <= k <= j. To turn the stemmer into a module, declare 'stem' as
510      * extern, and delete the remainder of this file.
511 	 */
512 	public const(char)[] stem(const char[] p)
513 	{
514         try {
515 		m_b = p;
516 		m_k = cast(int) p.length-1;
517 		m_k0 = 0;
518 		
519 		/**
520 		 * --DEPARTURE--
521 		 *
522 		 * With this line, strings of length 1 or 2 don't go through the
523 		 * stemming process, although no mention is made of this in the
524 		 * published algorithm. Remove the line to match the published
525 		 * algorithm.
526 		 *
527 		 */
528 		if( m_k <= m_k0 + 1 )
529 			return m_b;
530 			
531 		step1ab();
532 		step1c();
533 		step2();
534 		step3();
535 		step4();
536 		step5();
537 		return m_b[ m_k0 .. m_k + 1 ];
538             } catch(Throwable t) { return p; }
539 		
540 	}
541 
542 }
543 
544