/* oct27.ex1 */
/* apply first approach to fed paper */
/*
   Reads the raw text file 'w6y2009' one record at a time, skips every
   blank-delimited token that lies between a token starting with '<' and
   a token ending with '>' (markup), strips punctuation and white space
   from the remaining tokens, and writes each non-empty result to data
   set A as GWORD.  PROC PRINT then lists A starting at observation 200.
*/
options ls=80 ;

data a ;
  keep gword ;                       /* only the cleaned words are stored */

  /* LINE holds one raw record.  LINE2 holds the same record after a blank
     has been inserted before every '<' and after every '>'.  Each bracket
     adds one byte, so the worst case is twice the record length; LINE2 is
     sized accordingly so the tail of a long record is never truncated.   */
  length line  $1000
         line2 $2000
         word  $100 ;
  length gword $30 ;                 /* cleaned words can be shorter */

  infile 'w6y2009' pad lrecl=1000 ;

  /* GO = 1 while outside markup, 0 while inside.  It is retained so that
     markup spanning several records stays switched off across records.  */
  retain go 1 ;

  input line $ 1-1000 ;

  line2 = tranwrd(line,  '<', ' <') ;   /* blank in front of every <  */
  line2 = tranwrd(line2, '>', '> ') ;   /* blank behind every >       */

  num = countw(line2, ' ') ;            /* number of blank-delimited tokens */

  do i = 1 to num ;
    word = scan(line2, i, ' ') ;
    lw   = length(word) ;

    /* a token that begins with < opens markup: stop collecting */
    if substr(word, 1, 1) eq '<' then go = 0 ;

    if go eq 1 then do ;
      /* drop . , - / ; : and, through the 's' modifier, white space */
      gword = compress(word, '.,-/;:', 's') ;
      if lengthn(gword) > 0 then output ;   /* keep non-empty words only */
    end ;

    /* a token that ends with > closes markup: resume with the NEXT token.
       This test comes after the output step so the closing token itself
       is never written.                                                  */
    if substr(word, lw, 1) eq '>' then go = 1 ;
  end ;
run ;

/* print out results */
proc print data=a (firstobs=200) ;
  var gword ;
  title 'still first approach' ;
run ;