1 2 /************************************************************************** 3 * FPorterStemmer.d - 29-Mai-2007 4 * - Filter for perferming word stemming based on the Porter algorithm 5 * 6 * Copyright (c) 2007 Daniel Truemper (daniel-truemper at gmx.de) 7 * 8 **************************************************************************/ 9 10 /** 11 * Note: 12 * 13 * This implementation was converted from the python implementation 14 * made by Vivake Gupta (v@nano.com) so all credit belongs to him! 15 * 16 * ---- original comment 17 * 18 * This is the Porter stemming algorithm, ported to Python from the 19 * version coded up in ANSI C by the author. It may be be regarded 20 * as canonical, in that it follows the algorithm presented in 21 * 22 * Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, 23 * no. 3, pp 130-137, 24 * 25 * only differing from it at the points maked --DEPARTURE-- below. 26 * 27 * See also http://www.tartarus.org/~martin/PorterStemmer 28 * 29 * The algorithm as described in the paper could be exactly replicated 30 * by adjusting the points of DEPARTURE, but this is barely necessary, 31 * because (a) the points of DEPARTURE are definitely improvements, and 32 * (b) no encoding of the Porter stemmer I have seen is anything like 33 * as exact as this version, even with the points of DEPARTURE! 34 * 35 * Vivake Gupta (v@nano.com) 36 * 37 * Release 1: January 2001 38 * 39 * ---- end of original comment 40 * 41 */ 42 module PorterStemmer; 43 44 public struct PorterStemmer 45 { 46 private 47 { 48 const(char)[] m_b; // buffer for the word 49 int m_k = 0; 50 int m_k0 = 0; 51 int m_j = 0; // offset within the string 52 } 53 54 /** 55 * cons returns true, if b[i] is a consonant 56 */ 57 private bool cons( int i ) 58 { 59 if( m_b[i] == 'a' || m_b[i] == 'e' || m_b[i] == 'i' || m_b[i] == 'o' || m_b[i] == 'u' ) 60 return false; 61 62 if( m_b[i] == 'y' ) 63 { 64 if( i == m_k0 ) 65 { 66 return true; 67 } else 68 { 69 return !cons( i - 1 ); 70 } 71 } 72 return true; 73 } 74 75 /** 76 * measures the number of consonant sequences between k0 and j. 77 * if c is a consonant sequence and v a vowel sequence, and <..> 78 * indicates arbitrary presence, 79 * 80 * <c><v> gives 0 81 * <c>vc<v> gives 1 82 * <c>vcvc<v> gives 2 83 * <c>vcvcvc<v> gives 3 84 * 85 */ 86 private int m() 87 { 88 int n = 0; 89 int i = m_k0; 90 91 while( true ) 92 { 93 if( i > m_j ) 94 { 95 return n; 96 } 97 if( !cons( i ) ) 98 { 99 break; 100 } 101 i++; 102 } 103 i++; 104 while( true ) 105 { 106 while( true ) 107 { 108 if( i > m_j ) 109 { 110 return n; 111 } 112 if( cons( i ) ) 113 { 114 break; 115 } 116 i++; 117 } 118 i++; 119 n++; 120 while( true ) 121 { 122 if( i > m_j ) 123 { 124 return n; 125 } 126 if( !cons( i ) ) 127 { 128 break; 129 } 130 i++; 131 } 132 i++; 133 } 134 } 135 136 /** 137 * returns true if k0...j contains a vowel 138 */ 139 private bool vowelinstem() 140 { 141 for( int i = m_k0; i < m_j + 1; i++ ) 142 { 143 if( !cons( i ) ) 144 return true; 145 } 146 return false; 147 } 148 149 /** 150 * returns true if j, j-1 contains a double consonant 151 */ 152 private bool doublec( int j ) 153 { 154 if( j < ( m_k0 + 1 ) ) 155 return false; 156 if( m_b[j] != m_b[j-1] ) 157 return false; 158 return cons( j ); 159 } 160 161 /** 162 * is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant 163 * and also if the second c is not w,x or y. this is used when trying to 164 * restore an e at the end of a short e.g. 165 * 166 * cav(e), lov(e), hop(e), crim(e), but 167 * snow, box, tray. 168 * 169 */ 170 private bool cvc( int i ) 171 { 172 if( i < ( m_k0 + 2 ) || !cons( i ) || cons( i-1 ) || !cons( i-2 ) ) 173 return false; 174 if( m_b[i] == 'w' || m_b[i] == 'x' || m_b[i] == 'y' ) 175 return false; 176 return true; 177 } 178 179 /** 180 * ends(s) is TRUE <=> k0,...k ends with the string s. 181 */ 182 private bool ends( const char[] s ) 183 { 184 int len = cast(int) s.length; 185 186 if( s[ len - 1 ] != m_b[ m_k ] ) 187 return false; 188 if( len > ( m_k - m_k0 + 1 ) ) 189 return false; 190 191 int a = m_k - len + 1; 192 int b = m_k + 1; 193 194 if( m_b[a..b] != s ) 195 { 196 return false; 197 } 198 m_j = m_k - len; 199 200 return true; 201 } 202 203 /** 204 * setto(s) sets (j+1),...k to the characters in the string s, readjusting k. 205 */ 206 private void setto( const char[] s ) 207 { 208 m_b = m_b[0..m_j+1] ~ s ~ m_b[ m_j + s.length + 1 .. m_b.length ]; 209 m_k = m_j + cast(int) s.length; 210 } 211 212 /** 213 * used further down 214 */ 215 private void r( const char[] s ) 216 { 217 if( m() > 0 ) 218 setto( s ); 219 } 220 221 /** 222 * step1ab() gets rid of plurals and -ed or -ing. e.g. 223 * 224 * caresses -> caress 225 * ponies -> poni 226 * ties -> ti 227 * caress -> caress 228 * cats -> cat 229 * 230 * feed -> feed 231 * agreed -> agree 232 * disabled -> disable 233 * 234 * matting -> mat 235 * mating -> mate 236 * meeting -> meet 237 * milling -> mill 238 * messing -> mess 239 * 240 * meetings -> meet 241 * 242 */ 243 private void step1ab() 244 { 245 if( m_b[ m_k ] == 's' ) 246 { 247 if( ends( "sses" ) ) 248 m_k = m_k - 2; 249 else if( ends( "ies" ) ) 250 setto( "i" ); 251 else if( m_b[ m_k - 1 ] != 's' ) 252 m_k--; 253 } 254 if( ends( "eed" ) ) 255 { 256 if( m() > 0 ) 257 m_k--; 258 } else if( ( ends( "ed" ) || ends( "ing" ) ) && vowelinstem() ) 259 { 260 m_k = m_j; 261 if( ends( "at" ) ) 262 { 263 setto( "ate" ); 264 } else if( ends( "bl" ) ) 265 { 266 setto( "ble" ); 267 } else if( ends( "iz" ) ) 268 { 269 setto( "ize" ); 270 } else if( doublec( m_k ) ) 271 { 272 m_k--; 273 if( m_b[ m_k ] == 'l' || m_b[ m_k ] == 's' || m_b[ m_k ] == 'z' ) 274 m_k++; 275 } else if( m() == 1 && cvc( m_k ) ) 276 { 277 setto( "e" ); 278 } 279 } 280 } 281 282 /** 283 * step1c() turns terminal y to i when there is another vowel in the stem. 284 */ 285 private void step1c() 286 { 287 if( ends( "y" ) && vowelinstem() ) 288 { 289 m_b = m_b[0..m_k] ~ 'i' ~ m_b[ m_k+1 .. m_b.length ]; 290 } 291 } 292 293 /** 294 * step2() maps double suffices to single ones. 295 * so -ization ( = -ize plus -ation) maps to -ize etc. note that the 296 * string before the suffix must give m() > 0.* 297 */ 298 private void step2() 299 { 300 if( m_b[ m_k - 1 ] == 'a' ) 301 { 302 if( ends( "ational" ) ) 303 r( "ate" ); 304 else if( ends( "tional" ) ) 305 r( "tion" ); 306 } else if( m_b[ m_k - 1 ] == 'c' ) 307 { 308 if( ends( "enci" ) ) 309 r( "ence" ); 310 else if( ends( "anci" ) ) 311 r( "ance" ); 312 } else if( m_b[ m_k - 1 ] == 'e' ) 313 { 314 if( ends( "izer" ) ) 315 r( "ize" ); 316 } else if( m_b[ m_k - 1 ] == 'l' ) 317 { 318 if( ends( "bli" ) ) 319 r( "ble" ); 320 /* --DEPARTURE-- 321 * To match the published algorithm, replace this phrase with 322 * if( ends( "abli" ) ) 323 * r( "able" ); 324 */ 325 else if( ends( "alli" ) ) 326 r( "al" ); 327 else if( ends( "entli" ) ) 328 r( "ent" ); 329 else if( ends( "eli" ) ) 330 r( "e" ); 331 else if( ends( "ousli" ) ) 332 r( "ous" ); 333 } else if( m_b[ m_k - 1 ] == 'o' ) 334 { 335 if( ends( "ization" ) ) 336 r( "ize" ); 337 else if( ends( "ation" ) || ends( "ator" ) ) 338 r( "ate" ); 339 } else if( m_b[ m_k - 1 ] == 's' ) 340 { 341 if( ends( "alism" ) ) 342 r( "al" ); 343 else if( ends( "iveness" ) ) 344 r( "ive" ); 345 else if( ends( "fulness" ) ) 346 r( "ful" ); 347 else if( ends( "ousness" ) ) 348 r( "ous" ); 349 } else if( m_b[ m_k - 1 ] == 't' ) 350 { 351 if( ends( "aliti" ) ) 352 r( "al" ); 353 else if( ends( "iviti" ) ) 354 r( "ive" ); 355 else if( ends( "biliti" ) ) 356 r( "ble" ); 357 } else if( m_b[ m_k - 1 ] == 'g' ) 358 { 359 /** 360 * --DEPARTURE-- 361 * To match the published algorithm, delete this phrase 362 */ 363 if( ends( "logi" ) ) 364 r( "log" ); 365 } 366 367 } 368 369 /** 370 * step3() dels with -ic-, -full, -ness etc. similar strategy to step2. 371 */ 372 private void step3() 373 { 374 if( m_b[ m_k ] == 'e' ) 375 { 376 if( ends( "icate" ) ) 377 r( "ic" ); 378 else if( ends( "ative" ) ) 379 r( "" ); 380 else if( ends( "alize" ) ) 381 r( "al" ); 382 }else if( m_b[ m_k ] == 'i' ) 383 { 384 if( ends( "iciti" ) ) 385 r( "ic" ); 386 }else if( m_b[ m_k ] == 'l' ) 387 { 388 if( ends( "ical" ) ) 389 r( "ic" ); 390 else if( ends( "ful" ) ) 391 r( "" ); 392 }else if( m_b[ m_k ] == 's' ) 393 { 394 if( ends( "ness" ) ) 395 r( "" ); 396 } 397 } 398 399 /** 400 * step4() takes off -ant, -ence etc., in context <c>vcvc<v>. 401 */ 402 private void step4() 403 { 404 405 /* fixes bug 1 */ 406 if( m_k == 0 ) 407 return; 408 switch( m_b[ m_k - 1 ] ) 409 { 410 411 case 'a': 412 if( ends( "al" ) ) 413 break; 414 return; 415 416 case 'c': 417 if( ends( "ance" ) || ends( "ence" ) ) 418 break; 419 return; 420 421 case 'e': 422 if( ends( "er" ) ) 423 break; 424 return; 425 426 case 'i': 427 if( ends( "ic" ) ) 428 break; 429 return; 430 431 case 'l': 432 if( ends( "able" ) || ends( "ible" ) ) 433 break; 434 return; 435 436 case 'n': 437 if( ends( "ant" ) || ends( "ement" ) || ends( "ment" ) || ends( "ent" ) ) 438 break; 439 return; 440 441 case 'o': 442 if( ends( "ion" ) && m_j >= 0 && ( m_b[ m_j ] == 's' || m_b[ m_j ] == 't' ) ) 443 { 444 /* m_j >= 0 fixes bug 2 */ 445 break; 446 } 447 if( ends( "ou" ) ) 448 break; 449 return; 450 451 case 's': 452 if( ends( "ism" ) ) 453 break; 454 return; 455 456 case 't': 457 if( ends( "ate" ) || ends( "iti" ) ) 458 break; 459 return; 460 461 case 'u': 462 if( ends( "ous" ) ) 463 break; 464 return; 465 466 case 'v': 467 if( ends( "ive" ) ) 468 break; 469 return; 470 471 case 'z': 472 if( ends( "ize" ) ) 473 break; 474 return; 475 476 default: 477 return; 478 479 } 480 481 if( m() > 1 ) 482 m_k = m_j; 483 484 } 485 486 /** 487 * step5() removes a final -e if m() > 1, and changes -ll to -l if m() > 1. 488 */ 489 private void step5() 490 { 491 m_j = m_k; 492 if( m_b[ m_k ] == 'e' ) 493 { 494 auto a = m(); 495 if( a > 1 || (a == 1 && !cvc( m_k - 1 ) ) ) 496 m_k--; 497 } 498 if( m_b[ m_k ] == 'l' && doublec( m_k ) && m() > 1 ) 499 m_k--; 500 501 } 502 503 /** 504 * In stem(p,i,j), p is a char pointer, and the string to be stemmed 505 * is from p[i] to p[j] inclusive. Typically i is zero and j is the 506 * offset to the last character of a string, (p[j+1] == '\0'). The 507 * stemmer adjusts the characters p[i] ... p[j] and returns the new 508 * end-point of the string, k. Stemming never increases word length, so 509 * i <= k <= j. To turn the stemmer into a module, declare 'stem' as 510 * extern, and delete the remainder of this file. 511 */ 512 public const(char)[] stem(const char[] p) 513 { 514 try { 515 m_b = p; 516 m_k = cast(int) p.length-1; 517 m_k0 = 0; 518 519 /** 520 * --DEPARTURE-- 521 * 522 * With this line, strings of length 1 or 2 don't go through the 523 * stemming process, although no mention is made of this in the 524 * published algorithm. Remove the line to match the published 525 * algorithm. 526 * 527 */ 528 if( m_k <= m_k0 + 1 ) 529 return m_b; 530 531 step1ab(); 532 step1c(); 533 step2(); 534 step3(); 535 step4(); 536 step5(); 537 return m_b[ m_k0 .. m_k + 1 ]; 538 } catch(Throwable t) { return p; } 539 540 } 541 542 } 543 544