Data Searching using Carverphone
by: G.E. Ozz Nixon Jr.
Published: October 2007
©opyright 2009 by Friends of FPC
Data Searching is a very complex art in itself. Ignoring the long history of
Internet "search engines" growing from Archie in 1992 to today's Google engine.
Original techniques to provide users with "full text" search required a solution
designed for common mistakes, one of which is called Carverphone.
Carverphone is not a well published phonetic algorithm, it was on a private
web site for students to see and understand phonetic comparisons. If you get familiar
with the logic, you will places that you can extend this code and make a perfect hit
for our samples - and even our harder to match patterns. All results use a 10 byte
key, useful for binary searching.
Code snippet from DXSock 6.0 WLM (Courtesy of Brain Patchwork DX, LLC)
function Carverphone(Const S: string): string; // another theory (ported from C#)
var
Loop:Integer;
MaxLoop:Integer;
TmpStr:String;
Begin
Result:='';
If (S='') then Exit;
TmpStr:=Lowercase(SpacesOnly(UpperCase(S)));
{$IFDEF ASSEMBLY}
asm // 1090108: By Ozz (FASTER THAN Length()!)
MOV EAX, TmpStr; // Store Str Address
MOV EAX, [EAX-$04]; // Move to "Size" Int32
MOV Loop, EAX; // Put into Result
End;
{$ELSE}
Loop:=Length(TmpStr);
{$ENDIF}
If Copy(TmpStr,Loop,1)='e' then Delete(TmpStr,Loop,1);
if Loop=0 then exit;
TmpStr:=StringReplace(TmpStr,'ugh','u2f',[rfReplaceAll]);
TmpStr:=StringReplace(TmpStr,'gn','2n',[rfReplaceAll]);
{$IFDEF ASSEMBLY}
asm // 1090108: By Ozz (FASTER THAN Length()!)
MOV EAX, TmpStr; // Store Str Address
MOV EAX, [EAX-$04]; // Move to "Size" Int32
MOV Loop, EAX; // Put into Result
End;
{$ELSE}
Loop:=Length(TmpStr);
{$ENDIF}
If Copy(TmpStr,Loop-1,2)='mb' then begin
{$IFDEF ASSEMBLY}
asm // 1090108: By Ozz (FASTER THAN Length()!)
MOV EAX, TmpStr; // Store Str Address
MOV EAX, [EAX-$04]; // Move to "Size" Int32
MOV Loop, EAX; // Put into Result
End;
{$ELSE}
Loop:=Length(TmpStr);
{$ENDIF}
If Copy(TmpStr,Loop-1,2)='mb' then begin
{$IFDEF ASSEMBLY}
asm // 1090108: By Ozz (FASTER THAN Length()!)
MOV EAX, TmpStr; // Store Str Address
MOV EAX, [EAX-$04]; // Move to "Size" Int32
MOV Loop, EAX; // Put into Result
End;
{$ELSE}
Loop:=Length(TmpStr);
{$ENDIF}
Delete(TmpStr,Loop-1,2);
TmpStr:=TmpStr+'2';
end;
TmpStr:=StringReplace(TmpStr,'cq','2q',[rfReplaceAll]);
TmpStr:=StringReplace(TmpStr,'ci','si',[rfReplaceAll]);
TmpStr:=StringReplace(TmpStr,'ce','se',[rfReplaceAll]);
TmpStr:=StringReplace(TmpStr,'cy','sy',[rfReplaceAll]);
TmpStr:=StringReplace(TmpStr,'tch','2ch',[rfReplaceAll]);
TmpStr:=StringReplace(TmpStr,'c','k',[rfReplaceAll]);
TmpStr:=StringReplace(TmpStr,'q','k',[rfReplaceAll]);
TmpStr:=StringReplace(TmpStr,'x','k',[rfReplaceAll]);
TmpStr:=StringReplace(TmpStr,'v','f',[rfReplaceAll]);
TmpStr:=StringReplace(TmpStr,'dg','2g',[rfReplaceAll]);
TmpStr:=StringReplace(TmpStr,'tio','sio',[rfReplaceAll]);
TmpStr:=StringReplace(TmpStr,'tia','sia',[rfReplaceAll]);
TmpStr:=StringReplace(TmpStr,'d','t',[rfReplaceAll]);
TmpStr:=StringReplace(TmpStr,'ph','fh',[rfReplaceAll]);
TmpStr:=StringReplace(TmpStr,'b','p',[rfReplaceAll]);
TmpStr:=StringReplace(TmpStr,'sh','s2',[rfReplaceAll]);
TmpStr:=StringReplace(TmpStr,'z','s',[rfReplaceAll]);
if TmpStr[1] in ['a','e','i','o','u'] then TmpStr[1]:='A';
{$IFDEF ASSEMBLY}
asm // 1090108: By Ozz (FASTER THAN Length()!)
MOV EAX, TmpStr; // Store Str Address
MOV EAX, [EAX-$04]; // Move to "Size" Int32
MOV MaxLoop, EAX; // Put into Result
End;
{$ELSE}
MaxLoop:=Length(TmpStr);
{$ENDIF}
if TmpStr[loop] in ['a','e','i','o','u'] then TmpStr[Loop]:='3';
TmpStr:=StringReplace(TmpStr,'j','y',[rfReplaceAll]);
If Copy(TmpStr,1,2)='y3' then TmpStr[1]:='Y';
If Copy(TmpStr,1,1)='y' then TmpStr[1]:='A';
TmpStr:=StringReplace(TmpStr,'y','3',[rfReplaceAll]);
TmpStr:=StringReplace(TmpStr,'3gh3','3kh3',[rfReplaceAll]);
TmpStr:=StringReplace(TmpStr,'gh','22',[rfReplaceAll]);
TmpStr:=StringReplace(TmpStr,'g','k',[rfReplaceAll]);
TmpStr:=StringReplace(TmpStr,'s','S',[rfReplaceAll]);
TmpStr:=StringReplace(TmpStr,'t','T',[rfReplaceAll]);
TmpStr:=StringReplace(TmpStr,'p','P',[rfReplaceAll]);
TmpStr:=StringReplace(TmpStr,'k','K',[rfReplaceAll]);
TmpStr:=StringReplace(TmpStr,'f','F',[rfReplaceAll]);
TmpStr:=StringReplace(TmpStr,'m','M',[rfReplaceAll]);
TmpStr:=StringReplace(TmpStr,'n','N',[rfReplaceAll]);
TmpStr:=StringReplace(TmpStr,'w3','W3',[rfReplaceAll]);
TmpStr:=StringReplace(TmpStr,'wh3','Wh3',[rfReplaceAll]);
{$IFDEF ASSEMBLY}
asm // 1090108: By Ozz (FASTER THAN Length()!)
MOV EAX, TmpStr; // Store Str Address
MOV EAX, [EAX-$04]; // Move to "Size" Int32
MOV Loop, EAX; // Put into Result
End;
{$ELSE}
Loop:=Length(TmpStr);
{$ENDIF}
If Copy(TmpStr,Loop,1)='w' then TmpStr[Loop]:='3';
TmpStr:=StringReplace(TmpStr,'w','2',[rfReplaceAll]);
If Copy(TmpStr,1,1)='h' then TmpStr[1]:='A';
TmpStr:=StringReplace(TmpStr,'h','2',[rfReplaceAll]);
TmpStr:=StringReplace(TmpStr,'r3','R3',[rfReplaceAll]);
{$IFDEF ASSEMBLY}
asm // 1090108: By Ozz (FASTER THAN Length()!)
MOV EAX, TmpStr; // Store Str Address
MOV EAX, [EAX-$04]; // Move to "Size" Int32
MOV Loop, EAX; // Put into Result
End;
{$ELSE}
Loop:=Length(TmpStr);
{$ENDIF}
If Copy(TmpStr,Loop,1)='r' then TmpStr[Loop]:='3';
TmpStr:=StringReplace(TmpStr,'r','2',[rfReplaceAll]);
TmpStr:=StringReplace(TmpStr,'l3','L3',[rfReplaceAll]);
{$IFDEF ASSEMBLY}
asm // 1090108: By Ozz (FASTER THAN Length()!)
MOV EAX, TmpStr; // Store Str Address
MOV EAX, [EAX-$04]; // Move to "Size" Int32
MOV Loop, EAX; // Put into Result
End;
{$ELSE}
Loop:=Length(TmpStr);
{$ENDIF}
If Copy(TmpStr,Loop,1)='l' then TmpStr[Loop]:='3';
TmpStr:=StringReplace(TmpStr,'l','2',[rfReplaceAll]);
{$IFDEF ASSEMBLY}
asm // 1090108: By Ozz (FASTER THAN Length()!)
MOV EAX, TmpStr; // Store Str Address
MOV EAX, [EAX-$04]; // Move to "Size" Int32
MOV Loop, EAX; // Put into Result
End;
{$ELSE}
Loop:=Length(TmpStr);
{$ENDIF}
If Copy(TmpStr,Loop,1)='3' then TmpStr[Loop]:='A';
TmpStr:=StringReplace(TmpStr,'3','',[rfReplaceAll]);
Result:=Copy(TmpStr+'1111111111',1,10);
End;
Using this algorithm, both "Robert" and "Rupert" return the same string "RP2T111111"
while "Rubin" and "Robin" yields "RPN1111111". "Ashcroft" and "Ashcraft" yields "AS2KRFT111".
However, "George" and "Jorge" yields "K2K1111111" and "Y2K1111111" and "Dave" and "David"
yields "TF11111111" and "TFT1111111".
This simple example shows an improvement for Ashcroft and Ashcraft over Soundex, however,
George and Jorge and Dave and David need an improved theory. Read more on
other comparison techniques I used in my search engines.
G.E. Ozz Nixon Jr.