page	55,132
title	TK - Token Parsing filter
;
;	TK --- A Simple Token Parsing Filter for DOS 2.0
;
;	(c) Copyright 1984 by	Jim Mott
;				3710 Slopeview Drive
;				Sunnyvale, CA 95148
;				(408) 274-2620
;	All rights reserved. Permission granted to use this software for
;	personal, noncommercial purposes only.
;
;
;	This program is designed to be a filter for DOS 2.0.
;	It will tokenize its input and allow subsetting and/or
;	single token per line output.
;
;	The format of the command is:
;
;	TK {/RJx | /LJx} {/0} {{/v} | {/v/v}}
;	  where /RJx means right justify all tokens to x positions
;		/LJx means left justify all tokens to x positions
;		     In the two entries x must be in [1..15]
;		/0   means output one token per line
;		/v   means select token v for output. You may select any
;		     number, up to 255, of tokens to output. Repeats are
;		     allowed and you may change the order of the input tokens
;		     on the output line.
;
;	For example, to extract the list of users from a VM directory file and
;	write a sorted list of them without passwords to the printer the
;	following command line would be used.
;
;	FIND "USER " < DIRECT.VM | TK/LJ8/2/4/5/6/7/8/9 | SORT > PRN
;
;
;	For example, to find a list of all sub-directories of the current
;	directory sorted by sub-directory name we would use the following
;	command line:
;
;	DIR | FIND "<DIR>" | TK/LJ8/1/3/4 | SORT | MORE
;
;
;	For example, to generate a sorted list of all words used in a document
;	with one word per line we could use the following command line:
;
;	TK/RJ8/0 < FOOBAR.DOC | SORT | MORE
;
;
;
;
;
;
;
;
;
;
;
;
stak   segment para    stack   'STACK'
;      The program stack. The fill pattern is a 24 byte signature string
;      repeated 8 times, which reserves 8 * 24 = 192 bytes of stack space.
       db  8 dup('Jim Mott  (408) 274-2620')
stak   ends
;
;
dsect	segment para	'DATA'
;
;	Line buffer. bufget stores one input line here, terminated by <cr>.
;	The extra blank that follows guarantees that a token scan running
;	off the end of the buffer always finds a separator.
;
buffer	db	255 dup('?')		; where to put the data
	db	' '			; be sure to end scan correctly
;
;	Raw read buffer shared by cget and bufget.
;
glen	dw	0			; unread bytes remaining in gbuff
gbptr	dw	gbuff			; next unread byte in gbuff
gbuff	db	255 dup('G')		; block read from standard input
;
;	Option and state bits. The byte starts out all clear because the
;	code only ever ORs bits into it.
;
flag1	db	0			; options enabled
f1rj	equ	01h			; right justify tokens
f1lj	equ	02h			; left justify tokens
f1one	equ	04h			; output one token per line
f1sub	equ	08h			; substring function requested
f1work	equ	10h			; fill trailing spaces
f1oerr	equ	20h			; error in options string
f1eof	equ	40h			; end of file on standard input device
f1qeof	equ	80h			; queue the end of file
;
spaces	db	?			; number of trailing spaces required
;
toksiz	db	?			; token size if (f1rj or f1lj)
;
;	Token table. Each 3 byte entry is a length byte followed by a word
;	holding the offset of the token text inside the tokens area.
;
three	db	3			; length of each entry (not referenced
					; by the code in this file)
tokcnt	db	?			; count of tokens in table
toktbl	db	3*255 dup('0')		; table of token pointers and lengths
;
outcnt	db	?			; number of subsetting entries in
					; outers
outers	db	255 dup('1')		; list of token numbers to output
;
;	Token text area. A line of at most 255 characters holds at most 128
;	blank separated tokens, and a justified token occupies toksiz (at
;	most 15) bytes, so the worst case is 128 * 15 bytes. The former 900
;	byte area was overrun by long lines of short justified tokens.
;
tokptr	dw	?			; pointer to free token space
tokens	db	128*15 dup('2')		; string space of tokens
;
msgver	db	'TK: Incorrect DOS version. Must be at least 2.00.'
	db	0dh,0ah,'$'
optmsg	db	'TK: Incorrect parameters given.'
	db	0dh,0ah
optmgl	equ	$ - optmsg
noroom	db	'TK: No room for user on device.'
	db	0dh,0ah
lnoroom	equ	$ - noroom
chrspa	db	' '			; a space to output
chrclf	db	0dh,0ah			; <cr><lf> sequence
dsect	ends
;
;
csect	segment para	'CODE'
    assume cs:csect,ds:dsect,ss:stak
;
main	proc	far
;
;	Program entry point. Checks the DOS version, parses the command
;	options, then reads standard input a line at a time, splits each
;	line into tokens and writes the requested tokens to standard output.
;
	push	ds			; build a far return address of
	xor	ax,ax			; (entry DS):0000 on the stack for
	push	ax			; the far ret used on DOS 1.x
	mov	ax,dsect		; address our own data segment
	mov	ds,ax
	mov	ah,30h			; DOS function: get version number
	int	21h
	cmp	al,2			; major version 2 or later?
	jge	mainopt			; yes - carry on
	mov	dx,offset msgver	; no - show the '$' terminated
	mov	ah,9			; message with the print string call
	int	21h
	ret				; and leave through the far return
;
mainopt: call	options			; ES still addresses the command tail
	push	ds			; from here on ES = DS so that the
	pop	es			; string moves in nextok work
	test	flag1,f1oerr		; did the options parse cleanly?
	jnz	mainbad			; no - complain and quit
;
mainlin: mov	tokcnt,0		; start each line with an empty table
	mov	tokptr,offset tokens	; and an empty token text area
	call	bufget			; cx = characters in the next line
	test	flag1,f1eof		; out of input?
	jnz	mainend			; yes - finish up
	dec	cx			; drop the trailing <cr> from the count
	jle	mainlin			; nothing left - fetch another line
	mov	bx,offset buffer	; bx scans the line
;
maintok: call	nextok			; peel off one token
	test	cx,cx			; anything left on the line?
	jnz	maintok			; yes - keep going
	call	write			; no - output what was collected
	jmp	short	mainlin		; and go round for the next line
;
mainbad: mov	dx,offset optmsg	; options error message
	mov	cx,optmgl		; and its length
	mov	bx,2			; handle 2 = standard error
	mov	ah,40h			; DOS function: write to file/device
	int	21h
;
mainend: call	crlf			; write a final <cr><lf> sequence
	mov	ax,4c00h		; DOS function: terminate process
	int	21h			; with return code 0
;
main	endp
;
;
;	OPTIONS - This subroutine will parse the options passed to the
;		  program and set the required bits in flag1. No registers
;		  are preserved since we are called only once, before the
;		  program has really started.
;
;		  On entry ES must address the segment holding the command
;		  tail (read from ES:81h up to the <cr>) and DS must address
;		  dsect. On exit flag1 has f1rj/f1lj/f1one/f1sub set as
;		  requested, toksiz holds the justify width, outers/outcnt
;		  hold the selected token numbers, and f1oerr is set if the
;		  options could not be parsed.
;
options	proc	near
;
	mov	flag1,0			; start with every flag clear since
					; bits are only ever ORed in
	mov	outcnt,0		; initialize outers count
	mov	si,81h			; point to the first parms character
;
opt01:	mov	al,byte ptr es:0[si]	; get a byte from the parm string
	inc	si			; point to the next byte
	cmp	al,0dh			; is it the end of the string?
	jne	opt02			; no - goody, more data to process
	ret				; yes, return to the caller then
;
opt02:	cmp	al,' '			; allow spaces anywhere before slashes
	je	opt01			; ignore them though
	cmp	al,'/'			; we have to start with a slash now
	je	opt04			; if it is a slash then process it
;
opterr:	or	flag1,f1oerr		; otherwise set the options error flag
	ret				; and return
;
opt04:	mov	al,byte ptr es:0[si]	; get the next character after slash
	inc	si			; point to next character in parms
	cmp	al,'a'			; is it lower case or funny?
	jl	opt4a			; no - process it normally then
	sub	al,'a'-'A'		; yes - map lower case to upper
;
opt4a:	cmp	al,'L'			; might it be left justify or numeric
	jl	optnum			; perhaps numeric - check it out
	jne	opt05			; it is not LJ for sure
	or	flag1,f1lj		; assume it is LJ for the moment
	test	flag1,f1rj		; make sure this isn't a duplicate
	jnz	opterr			; if RJ already then big problems
	jmp	short	opt06		; and rejoin common justify code
;
opt05:	cmp	al,'R'			; might it be right justify (RJ)?
	jne	opterr			; no - then it is an error
	or	flag1,f1rj		; yes - assume for the moment it is
	test	flag1,f1lj		; make sure we aren't trying to left
	jnz	opterr			; justify too - that is an error
;
opt06:	mov	al,byte ptr es:0[si]	; get the next character
	inc	si			; point to the next character in parms
	cmp	al,'J'			; is it the J we expect?
	je	opt6a			; yes - process it normally then
	cmp	al,'j'			; is it a lower case J
	jne	opterr			; no - that's too bad.
;
opt6a:	mov	al,byte ptr es:0[si]	; get the first byte of the number
	inc	si			; point to next character in parms
	call	decbin			; is it a number?
	jc	opterr			; no - then we have an error
	or	al,al			; is the field size 0?
	je	opterr			; yes - it is in error then
	cmp	al,15			; is field size more than 15?
	ja	opterr			; yes - error. The compare must be
					; unsigned: decbin returns 0..255 and
					; a signed test lets 128..255 through
	mov	toksiz,al		; save the justified field size
	jmp	short opt01		; and process further options
;
optnum:	call	decbin			; is it a number after slash?
	jc	opterr			; no - then it is an error
	or	al,al			; zero is special
	jne	opt08			; not zero - save it in array then
 	or	flag1,f1one		; zero means one token per line
	jmp	short	opt01		; process some other token then
;
opt08:	sub	cx,cx			; get a zeroed double register
	mov	cl,outcnt		; cx = number of entries stored so far
	lea	bx,outers		; point to the start of the list
	add	bx,cx			; bx points to the next free entry
	mov	byte ptr [bx],al	; save the token position to write
	inc	outcnt			; add one to outcnt
	or	flag1,f1sub		; make sure the subsetting flag is on
	jmp	opt01			; and look for the next option
;
options	endp
;
;
;	nextok - This subroutine will find the next token in the string
;		 pointed to by bx, with length contained in cx, and move
;		 it to the end of the token space. An entry in toktbl will
;		 be created for this token. When the subroutine returns cx
;		 will be zero if the source data string is empty. bx will
;		 point to the first character past the last token.
;
;		 cx must be at least 1 on entry and ES must equal DS (the
;		 token is copied with movsb). When f1rj or f1lj is set the
;		 token is truncated or blank padded to exactly toksiz bytes.
;		 If only blanks remain no token is stored and cx returns 0.
;		 ax, dx, si and di are changed.
;
nextok	proc	near
;
	mov	di,tokptr		; get pointer to where to put token
;
next01:	mov	al,byte ptr 0[bx]	; loop past junk
	cmp	al,' '			; is it a leading space?
	jne	next03			; no - then we have a token
	inc	bx			; yes - point to the next character
	loop	next01			; and try that one
	ret				; only blanks were left - cx is zero
;
next03:	mov	si,bx			; save pointer to start of token
	mov	ah,1			; initial guess for token length is 1
;
next04:	inc	bx			; point to the next character in input
	mov	al,byte ptr [bx]	; get the character
	cmp	al,' '			; is it the end of the token?
	je	next05			; yes - we have some good numbers
	inc	ah			; no - increment count of contiguous
					; characters.
	loop	next04			; continue till out of chars or a
					; field separator
	dec	ah			; the count ran out: the last character
					; counted lies past the end of the
					; line, so take it back off
next05:	push	cx			; save number of chars left in source
					; string
	test	flag1,f1rj + f1lj	; do we have a maximum token length?
	jz	next09			; no - just a normal token write then
	cmp	ah,toksiz		; yes - is this token just right?
	je	next09			; it sure is. we will keep it as is
	jb	next06			; if token size < max token size - pad
					; (unsigned: lengths of 128 and up
					; must not look negative)
	mov	ah,toksiz		; otherwise take max token size as own
	jmp	short	next09		; and continue normally
;
next06:	mov	al,toksiz		; get the token size we must pad to
	sub	al,ah			; al contains number of spaces needed
	test	flag1,f1lj		; left justify? (pad right with space)
	jz	next07			; no - must pad to the left with space
	mov	spaces,al		; yes - save how many spaces to fill
	or	flag1,f1work		; mark as work to do later on
	jmp	short	next09		; and join mainline code
;
next07:	mov	cl,al			; cx contains number of leading spaces
;
next08:	mov	byte ptr [di],' '	; put a leading space in this token
	inc	di			; point to the next slot
	loop	next08			; and fill in all needed spaces
;
next09:	mov	cl,ah			; cx now contains total number of
					; chars in token
	cld				; make the direction ever upward
	rep	movsb			; move the token to its spot
	test	flag1,f1work		; is it left justified (need spaces)
	jz	next11			; no - we are done with hard part then
	mov	cl,spaces		; get count of spaces needed
;
next10: mov	byte ptr [di],' '	; move in a trailing space
	inc	di			; point to the next slot and
	loop	next10			; cont. till all trailing spaces done
	and	flag1,255-f1work	; reset the work to do bit
;
next11:	mov	dx,tokptr		; get pointer to start of this token
	mov	tokptr,di		; save pointer to next free token byte
	test	flag1,f1lj + f1rj	; do we have fixed length tokens?
	jz	next12			; no - take them as we get them
	mov	ah,toksiz		; yes - set this tokens length
;
next12:	mov	cl,ah			; save length of token
	mov	al,3			; number of bytes per entry
	mul	tokcnt			; ax is now an offset in toktbl
	lea	si,toktbl		; point to start of token table
	add	si,ax			; si points to an entry in toktbl
	mov	byte ptr [si],cl	; move in length of entry
	mov	word ptr 1[si],dx	; save pointer to start of token
	inc	tokcnt			; count one more token
	pop	cx			; cx contains number of source chars
	or	cx,cx			; left. Are we done yet?
	jz	next13			; yes - return
	dec	cx			; no - correct for undercounting by 1
;
next13:	ret				; and return
;
nextok	endp
;
;
;	write - This routine will write the tokens to the standard output
;		device. It is controlled by the settings of flags in flag1.
;
;		Without f1sub every token in toktbl is written in input
;		order. With f1sub the token numbers listed in outers are
;		written in that order; numbers larger than tokcnt are
;		skipped. Unless f1one is set the line is ended with a
;		<cr><lf>. Output stops as soon as oswrite reports a write
;		error by setting f1eof. ax, bx, cx and dx are changed.
;
write	proc	near
;
	sub	cx,cx			; get an empty loop counter
	mov	cl,tokcnt		; cl contains total number tokens read
	or	cx,cx			; do we have anything to write out?
	jnz	write1			; yes - then go for it
	ret				; no - we are done before we begin
;
write1:	test	flag1,f1sub		; are we changing their order?
	jnz	write3			; yes - then use different write logic
	sub	dl,dl			; no - just output them all in order
;
write2:	call	tout			; write token number dl
	test	flag1,f1eof		; did the write fail?
	jnz	write7			; yes - no point writing any more
	inc	dl			; point to the next token
	loop	write2			; and go through them all
	jmp	short	write6		; all tokens written
;
write3:	mov	cl,outcnt		; get the number tokens to write
	lea	bx,outers		; point to the first one to output
;
write4:	mov	dl,byte ptr [bx]	; get a token to write
	cmp	dl,tokcnt		; is it <= max token?
	ja	write5			; no - don't write it then. Unsigned,
					; so that 128..255 are rejected too
	dec	dl			; yes - adjust for origin one and
	call	tout			; write this token then
	test	flag1,f1eof		; did the write fail?
	jnz	write7			; yes - no point writing any more
;
write5:	inc	bx			; point to the next token count to
	loop	write4			; write and loop through whole list
;
write6:	test	flag1,f1one		; are we outputting one token/line?
	jnz	write7			; yes - the last <cr><lf> was written
	call	crlf			; no - write a trailing <cr><lf>
;
write7:	ret				; Done. Go home now.
;
write	endp
;
;
;	tout -	Write one token of the current line to standard output.
;		dl holds the zero based index of the token in toktbl. The
;		token is followed by a <cr><lf> when f1one is set and by a
;		single space otherwise. bx, cx and dx are preserved; ax is
;		not.
;
tout	proc	near
;
	push	bx			; keep the caller's registers
	push	cx
	push	dx
	mov	al,dl			; ax = 3 * token index, the offset
	mov	ah,3			; of this token's 3 byte entry in
	mul	ah			; toktbl
	mov	bx,ax
	mov	cl,toktbl[bx]		; first byte of entry = token length
	xor	ch,ch			; cx = length as a word
	mov	dx,word ptr toktbl[bx+1] ; next word = offset of token text
	call	oswrite			; send the token text
	test	flag1,f1one		; one token per line requested?
	jnz	toutnl			; yes - end the line instead
	mov	dx,offset chrspa	; no - follow the token with
	mov	cx,1			; one space
	call	oswrite
	jmp	short	toutxit
;
toutnl:	call	crlf			; terminate the line
;
toutxit: pop	dx			; give the registers back
	pop	cx
	pop	bx
	ret
;
tout	endp
;
;
;	crlf -	Write a <cr><lf> pair to standard output through oswrite.
;		ax, bx, cx and dx are preserved.
;
crlf	proc	near
;
	push	ax			; oswrite changes all four of these
	push	bx
	push	cx
	push	dx
	mov	dx,offset chrclf	; ds:dx -> the two byte sequence
	mov	cx,2			; cx = its length
	call	oswrite
	pop	dx
	pop	cx
	pop	bx
	pop	ax
	ret
;
crlf	endp
;
;
;	oswrite - Write cx bytes starting at ds:dx to standard output
;		  (handle 1). If DOS reports an error, or writes a different
;		  number of bytes than requested, the "no room" message is
;		  written to standard error (handle 2) and f1eof is set so
;		  that the program winds down. ax and bx are changed; cx and
;		  dx are changed as well when the error message is written.
;
oswrite	proc	near
;
	mov	ah,40h			; DOS function: write to file/device
	mov	bx,1			; handle 1 = standard output
	int	21h			; ax = bytes written, carry on error
	jc	oswfail			; DOS refused the request
	cmp	ax,cx			; was everything written?
	jne	oswfail			; no - short write, device is full
	ret				; yes - all is well
;
oswfail: mov	bx,2			; handle 2 = standard error
	mov	cx,lnoroom		; length of the message
	mov	dx,offset noroom	; and the message itself
	mov	ah,40h			; DOS function: write to file/device
	int	21h
	or	flag1,f1eof		; fake an end of file to stop the run
	ret
;
oswrite	endp
;
;
;	decbin -  Convert an unsigned decimal number to binary. On entry al
;		  holds the first character and es:si points to the
;		  characters that follow it. On a good return carry is clear,
;		  al holds the value (0..255) and si points to the first
;		  non-numeric character. Carry is set on return if the first
;		  character is not a digit or if the value does not fit in
;		  one byte. bx is preserved.
;
decbin	proc	near
;
	push	bx			; bx is our work register
	call	decdig			; first character must be a digit
	jc	decfail
;
decnext: mov	bl,al			; bl = value accumulated so far
	mov	al,es:[si]		; fetch the following character
	inc	si
	call	decdig			; another digit?
	jc	decdone			; no - the number is complete
	mov	bh,al			; bh = the new digit
	mov	al,10
	mul	bl			; ax = value so far * 10
	add	al,bh			; ax = ax + new digit
	adc	ah,0
	or	ah,ah			; still within one byte?
	jz	decnext			; yes - look for more digits
;
decfail: stc				; report an invalid number
	pop	bx
	ret
;
decdone: dec	si			; back up onto the non-digit
	mov	al,bl			; hand back the finished value
	pop	bx
	clc				; report success
	ret
;
;	decdig - local helper. Turns the character in al into its digit
;	value. Carry is clear if al was '0'..'9', set otherwise.
;
decdig:	sub	al,'0'			; '0'..'9' become 0..9; anything else
	cmp	al,10			; lands at 10 or more when unsigned
	cmc				; cmp gives carry for al < 10 - flip it
	ret
;
decbin	endp
;
;
;	bufget -  This routine will read one 'line' from the standard input
;		  device to buffer. On exit cx contains the count of chars
;		  read, including the <cr> that terminates the line in
;		  buffer. f1eof is set if an end of file condition is
;		  encountered; cx is not meaningful in that case.
;
;		  Control characters other than <cr> are stored as blanks.
;		  A line longer than 254 characters is returned in pieces of
;		  254 characters plus a <cr>. If the input ends without a
;		  <cr> the partial line is returned first and the end of
;		  file is reported on the following call (f1qeof). If f1eof
;		  is already set on entry (oswrite sets it after a write
;		  error) the routine returns at once with f1eof still set.
;		  ax, bx, dx, di and si are preserved.
;
bufget	proc	near
;
	push	ax			; save the registers
	push	bx
	push	dx
	push	di
	push	si
	test	flag1,f1eof		; has a write error ended the run?
	jnz	bufret			; yes - read nothing, leave f1eof set
	test	flag1,f1qeof		; should we reflect an immediate eof?
	jz	bufg00			; no - standard logic here then
	or	flag1,f1eof		; yes - set the end of file bit
	and	flag1,255 - f1qeof	; and say it is no longer pending
	jmp	short	bufret		; return now
;
bufg00:	sub	cx,cx			; count of characters gotten
	lea	di,buffer		; point destination to buffer
;
bufg01:	call	cget			; get one character
	test	flag1,f1eof		; did we get an eof on that try?
	jz	bufg02			; no - go and store the character
	or	cl,cl			; is there anything in the buffer?
	jz	bufret			; no - just return with cx=0 and f1eof
	mov	ah,0dh			; yes - slap a <cr> on the end
	call	cput			; put it at end of buffer
	inc	cx			; count the <cr> just as cget counts a
					; real one, so the caller does not
					; drop the last character of the file
	and	flag1,255 - f1eof	; clear the end of file bit
	or	flag1,f1qeof		; say next time turn on eof for sure
	jmp	short	bufret		; and return this last buffer
;
bufg02:	cmp	ah,0dh			; is the record terminator character?
	je	bufg03			; yes - don't turn that into a space
	cmp	ah,20h			; no - if not <cr>
	jae	bufg03			; if >= 20h then use as is (unsigned,
					; so 80h..0ffh are kept as well)
	mov	ah,' '			; otherwise make it a space
;
bufg03:	call	cput			; write char to output buffer
	cmp	ah,0dh			; just write the record terminator?
	je	bufret			; yes - then return
	cmp	cl,255			; written 255 characters yet?
	jne	bufg01			; no - get the next character
	dec	gbptr			; yes - the last slot must hold the
	inc	glen			; <cr>, so hand the 255th character
					; back to gbuff for the next call
	mov	byte ptr [di],0dh	; make it a terminator
;
bufret:	pop	si			; restore the registers
	pop	di
	pop	dx
	pop	bx
	pop	ax
	ret				; and return
;
bufget	endp
;
;
;
;	cget -	Return the next character of standard input in ah and add
;		one to cl. Characters are taken from gbuff, which is
;		refilled from DOS 255 bytes at a time whenever it is empty.
;		When DOS returns no data, or reports a read error, f1eof is
;		set instead; ah is then not a character and cl is left
;		unchanged. bx and dx are changed.
;
cget	proc	near
;
	mov	dx,glen			; is there any data in gbuff?
	or	dx,dx			; if count is zero there isn't
	jnz	cget01			; there is data so read it
	push	cx			; save the registers we might need
	push	di
	mov	ah,3fh			; DOS function read from standard in
	mov	bx,0			; file handle for standard in
	mov	cx,255			; number of characters to read
	lea	dx,gbuff		; point to where to put the data
	mov	gbptr,dx		; save pointer to first character
	int	21h			; call DOS function
	jnc	cget00			; carry clear - ax is a byte count
	sub	ax,ax			; carry set - ax is an error code, not
					; a count, so treat it as no data
cget00:	mov	glen,ax			; save the number of characters read
	mov	dx,ax			; put data count in dx
	pop	di			; restore the registers
	pop	cx
	or	dx,dx			; did we get data or eof?
	jnz	cget01			; data this time
	or	flag1,f1eof		; set the end of file encountered bit
	ret				; and return
;
cget01:	mov	bx,gbptr		; get pointer to the next character
	mov	ah,byte ptr [bx]	; and fetch the character
	inc	gbptr			; step past it
	dec	glen			; decrement length
	inc	cl			; count this character
	ret				; and return the character
;
cget	endp
;
;
;
;
;	cput -	Store the character in ah at ds:[di] and advance di, except
;		when cl is 255: di then stays on the slot just written so
;		that it never moves past the last slot of the line.
;
cput	proc	near
;
	mov	byte ptr [di],ah	; drop the character into place
	cmp	cl,255			; was that the last slot?
	je	cputxit			; yes - hold di where it is
	inc	di			; no - move on to the next slot
;
cputxit: ret
;
cput	endp
;
csect	ends
	end	main
                                                                                                
