-- package to construct and manipulate suffix arrays -- Authors: Rob Irving, Lorna Love -- Final version: 24 January 2001 -- This version constructs the suffix array by means of -- the standard version of the suffix binary search tree -- represented using dynamically created nodes with Ada.Text_Io; use Ada.Text_Io; with Ada.Integer_Text_Io; use Ada.Integer_Text_Io; with Unchecked_Deallocation; package body Sa_Standard is pragma Suppress(All_Checks); type Node_Type; type Tree_Type is access Node_Type; type Node_Type is record Suffix : Natural; Lcp : Natural; Sub : Dirn_Type; Lchild, Rchild : Tree_Type; end record; procedure Give_Back_Node is new Unchecked_Deallocation(Node_Type, Tree_Type); -------------------------------------------------------------------- function Unused_Char (S : String) return Character is -- returns an ASCII character that does not appear in S, -- giving priority to printable characters; in the case where there -- is no such character, raises the exception No_Dummy_Char_Available Cs : Char_Set_Type; Next : Integer; C : Character; First_Printable : constant := 32; Last_Printable : constant := 127; First_Non_Printable : constant := 0; Last_Non_Printable : constant := 31; begin Cs := Empty_Char_Set; for I in S'range loop Add(S(I), Cs); end loop; Next := First_Printable; while Next <= Last_Printable loop C := Character'Val(Next); if not Is_Member(C, Cs) then return C; else Next := Next + 1; end if; end loop; Next := Last_Non_Printable; while Next >= First_Non_Printable loop C := Character'Val(Next); if not Is_Member(C, Cs) then return C; else Next := Next - 1; end if; end loop; raise No_Dummy_Char_Available; end Unused_Char; -------------------------------------------------------------------- procedure Output_Sa(Sa : in Sa_Type; F_Name : in String) is F : File_Type; begin Create(F, Mode => Out_File, Name => F_Name); for I in Sa'range loop Put(F, Sa(I).Suffix, Width => 3); Put(F, Sa(I).Lcp, Width => 3); if Sa(I).Dir = Left then Put(F, "Left"); else Put(F, "Right"); end if; New_Line(F); end loop; Close(F); end Output_Sa; -------------------------------------------------------------------- function Size (T : in Tree_Type) return Integer is -- returns the number of nodes in tree T Count : Integer := 0; procedure Rec_Traverse(T : in Tree_Type) is begin if T /= null then Rec_Traverse(T.Lchild); Rec_Traverse(T.Rchild); Count := Count + 1; end if; end Rec_Traverse; begin Rec_Traverse(T); return Count; end Size; -------------------------------------------------------------------- procedure Insert_In_Sbst (T : in out Tree_Type; S : in String; J : in Natural) is -- Inserts the suffix S(J..len(S)) into T, the SBST for S -------------------------------------------------------------------- procedure Branch_Left (T : in out Tree_Type; D : in out Dirn_Type) is begin T := T.LChild; D := Left; end Branch_Left; -------------------------------------------------------------------- procedure Branch_Right (T : in out Tree_Type; D : in out Dirn_Type) is begin T := T.RChild; D := Right; end Branch_Right; -------------------------------------------------------------------- K, M, N : Natural; Llcp, Rlcp : Natural := 0; Dir : Dirn_Type; I, Prev_I : Tree_Type; begin I := T; M := S'Length - J + 1; while I /= null loop Prev_I := I; if I.Lcp > Integer'Max(Llcp, Rlcp) then if I.Sub = Left then Branch_Left(I, Dir); else Branch_Right(I, Dir); end if; elsif Llcp > Rlcp then if Llcp > I.Lcp then if I.Sub = Left then Rlcp := I.Lcp; end if; Branch_Right(I, Dir); elsif Llcp = I.Lcp and I.Sub = Right then Branch_Right(I, Dir); end if; elsif Rlcp > Llcp then if Rlcp > I.Lcp then if I.Sub = Right then Llcp := I.Lcp; end if; Branch_Left(I, Dir); elsif Rlcp = I.Lcp and I.Sub = Left then Branch_Left(I, Dir); end if; end if; if I = Prev_I then -- haven't yet branched, so compare characters K := I.Lcp; N := S'Length - I.Suffix + 1; while K < Integer'Min(M, N) and then S(J+K) = S(I.Suffix+K) loop K := K + 1; end loop; if K = M then Llcp := K; Branch_Left(I, Dir); elsif K = N then Rlcp := K; Branch_Right(I, Dir); elsif S(J + K) < S(I.Suffix + K) then Llcp := K; Branch_Left(I, Dir); else Rlcp := K; Branch_Right(I, Dir); end if; end if; end loop; if Dir = Left then if Llcp > Rlcp then Prev_I.Lchild := new Node_Type'(J, Llcp, Left, null, null); else Prev_I.Lchild := new Node_Type'(J, Rlcp, Right, null, null); end if; elsif Llcp > Rlcp then Prev_I.Rchild := new Node_Type'(J, Llcp, Left, null, null); else Prev_I.Rchild := new Node_Type'(J, Rlcp, Right, null, null); end if; end Insert_In_Sbst; -------------------------------------------------------------------- procedure Build_Sbst (S : in String; T : out Tree_Type ) is begin T := new Node_Type'(1, 0, Right, null, null); for K in 2..S'Length loop Insert_In_Sbst (T, S, K); end loop ; end Build_Sbst ; -------------------------------------------------------------------- procedure Destroy_Sbst (T : in out Tree_Type) is procedure Rec_Destroy(T : in out Tree_Type) is begin if T /= null then Rec_Destroy(T.Lchild); Rec_Destroy(T.Rchild); Give_Back_Node(T); end if; end Rec_Destroy; begin Rec_Destroy(T); T := null; end Destroy_Sbst ; -------------------------------------------------------------------- procedure Build_Gen_Sbst (S1, S2 : in String; T : out Tree_Type) is begin Build_Sbst(S1 & Unused_Char(S2) & S2, T); end Build_Gen_Sbst; -------------------------------------------------------------------- procedure Build_Partial_Sbst (S : in String; T : out Tree_Type; C : in Char_Set_Type) is J : Natural; begin J := 1; while not Is_Member(S(J), C) loop J := J + 1; end loop; T := new Node_Type'(J, 0, Right, null, null); for K in J+1..S'Length loop if not Is_Member(S(K-1),C) and Is_Member(S(K),C) then Insert_In_Sbst(T, S, K); end if; end loop; end Build_Partial_Sbst; ------------------------------------------------------------------- procedure Build_Gen_Partial_Sbst (S1, S2 : in String; T : out Tree_Type; C : in Char_Set_Type) is begin Build_Partial_Sbst(S1 & Unused_Char(S2) & S2, T, C); end Build_Gen_Partial_Sbst; ------------------------------------------------------------------- procedure Sbst_To_Isa(T : in Tree_Type; A : out Sa_Type) is -- transform suffix binary search tree T to -- 'intermediate' form of suffix array A I : Natural := 0; -- inorder position of current node Pred: Tree_Type; -- inorder predecessor of current node Anc : Tree_Type; -- 'best' ancestor of Pred procedure Traverse(T : in Tree_Type; K : in Tree_Type) is -- traverse the subtree rooted at T; -- K is the 'best' ancestor of T, i.e. T's nearest ancestor such that -- the sub value of K is undefined, or -- the sub value of K is Left and T is in the Right subtree of K, or -- the sub value of K is Right and T is in the Left subtree of K begin if T /= null then if T.Lcp = 0 then Traverse(T.LChild, T); elsif T.Sub = Right then Traverse(T.LChild, T); else Traverse(T.LChild, K); end if; I := I + 1; A(I).Suffix := T.Suffix; if I > 1 then if T.LChild = null then if T.Sub = Right then A(I-1).Lcp := T.Lcp; else A(I-1).Lcp := K.Lcp; end if; else if Pred.Sub = Left then A(I-1).Lcp := Pred.Lcp; else A(I-1).Lcp := Anc.Lcp; end if; end if; end if; Pred := T; Anc := K; if T.Lcp = 0 then Traverse(T.RChild, T); elsif T.Sub = Left then Traverse(T.RChild, T); else Traverse(T.RChild, K); end if; end if; end Traverse; begin Traverse(T, null); A(A'Last).Lcp := 0; end Sbst_To_Isa; -------------------------------------------------------------------- procedure Isa_To_Sa(A : in out Sa_Type) is -- transform 'intermediate' form of -- suffix array A to final form procedure Process(Low, High : in Integer) is Mid, Temp : Integer; begin if High - Low > 1 then Mid := (Low + High) / 2; Process(Low, Mid); Process(Mid, High); if Low = A'First-1 then A(Mid).Dir := Left; elsif High = A'Last+1 or else A(Low).Lcp > A(Mid).Lcp then Temp := A(Low).Lcp; A(Low).Lcp := A(Mid).Lcp; A(Mid).Lcp := Temp; A(Mid).Dir := Right; else A(Mid).Dir := Left; end if; end if; end Process; begin Process(A'First-1, A'Last+1); end Isa_To_Sa; -------------------------------------------------------------- procedure Build_Sa(S : in String; A : out Sa_Type) is T : Tree_Type; begin Build_Sbst(S, T); Sbst_To_Isa(T,A); Destroy_Sbst(T); Isa_To_SA(A); end Build_Sa; -------------------------------------------------------------------- procedure Build_Gen_Sa (S1, S2 : in String; A : out Sa_Type) is T : Tree_Type; begin Build_Gen_SBST(S1, S2, T); Sbst_To_Isa(T,A); Destroy_Sbst(T); Isa_To_SA(A); end Build_Gen_Sa; -------------------------------------------------------------------- procedure Build_Partial_Sa (S : in String; A : out Sa_Type; C : in Char_Set_Type; N : out Natural) is T : Tree_Type; begin Build_Partial_Sbst(S, T, C); N := Size(T); Sbst_To_Isa(T,A(1..N)); Destroy_Sbst(T); Isa_To_SA(A(1..N)); end Build_Partial_Sa; -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - procedure Build_Gen_Partial_Sa (S1, S2 : in String; A : out Sa_Type; C : in Char_Set_Type; N : out Natural) is T : Tree_Type; begin Build_Gen_Partial_Sbst(S1, S2, T, C); N := Size(T); Sbst_To_Isa(T,A(1..N)); Destroy_Sbst(T); Isa_To_SA(A(1..N)); end Build_Gen_Partial_Sa; -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - procedure Search_Left_and_Rightmost(A : in Sa_Type; S, X : in String; Pos_L, Pos_R : out Natural) is -- Searches the suffix array A, of string S, for a target -- string X; returns the leftmost position in A of a suffix that has -- X as a prefix, or zero if X is not a substring of S F, Lt, Rt, Mid, Limit, N, K, Pos, Temp_L, Temp_R : Natural; Found : Boolean := False; L, Llcp, Rlcp : Natural := 0; D : Dirn_Type; begin Found := False; Pos := 0; Lt := A'First-1; Rt := A'Last+1; while not Found and Rt - Lt > 1 loop Mid := (Lt + Rt) / 2; D := A(Mid).Dir; L := A(Mid).Lcp; if L > Integer'Max(Llcp, Rlcp) then if D = Left then Rt := Mid; else Lt := Mid; end if; elsif Llcp > Rlcp then if Llcp > L then if D = Left then Rlcp := L; end if; Lt := Mid; elsif Llcp = L and D = Right then Lt := Mid; end if; elsif Rlcp > Llcp then if Rlcp > L then if D = Right then Llcp := L; end if; Rt := Mid; elsif Rlcp = L and D = Left then Rt := Mid; end if; end if; if (Lt /= Mid and Rt /= Mid) then -- need to compare characters F := A(Mid).Suffix; K := L; N := S'Length - F + 1; Limit := Integer'Min(X'Length, N); while K < Limit and then X(K + X'First) = S(F + K) loop K := K + 1; end loop; if K = X'Length then Found := True; Pos := Mid; elsif K = N or else X(K+X'First) > S(F+K) then Rlcp := K; Lt := Mid; else Llcp := K; Rt := Mid; end if; end if; end loop; Pos_L := Pos; Pos_R := Pos; if Pos /= 0 then Temp_L := Mid; Temp_R := Rt; Rt := Mid; while Rt - Lt > 1 loop Mid := (Lt + Rt) / 2; D := A(Mid).Dir; L := A(Mid).Lcp; if L < X'Length or else D = Right then Lt := Mid; else Pos_L := Mid; Rt := Mid; end if; end loop; Rt := Temp_R; Lt := Temp_L; while Rt - Lt > 1 loop Mid := (Lt + Rt) / 2; D := A(Mid).Dir; L := A(Mid).Lcp; if L < X'Length or else D = Left then Rt := Mid; else Pos_R := Mid; Lt := Mid; end if; end loop; end if; end Search_Left_and_Rightmost; -------------------------------------------------------------------- procedure Search_Sa(A : in Sa_Type; S, X : in String; Found : out Boolean; Pos : out Natural) is Pos_L, Pos_R : Natural; begin Search_Left_and_Rightmost(A, S, X, Pos_L, Pos_R); if Pos_L = 0 then Found := False; else Found := True; Pos := A(Pos_L).Suffix; end if; end Search_Sa; -------------------------------------------------------------------- procedure Fully_Search_Sa(A : in Sa_Type; S, X : in String; Pos_List : out Integer_List_Type) is Pos_L, Pos_R : Natural; begin Create_Empty_List(Pos_List); Search_Left_and_Rightmost(A, S, X, Pos_L, Pos_R); if Pos_L > 0 then for I in Pos_L .. Pos_R loop Insert(A(I).Suffix, Pos_List); end loop; end if; end Fully_Search_Sa; -------------------------------------------------------------------- function Num_Occurrences(A : Sa_Type; S, X : String) return Natural is Pos_L, Pos_R : Natural; begin Search_Left_and_Rightmost(A, S, X, Pos_L, Pos_R); if Pos_L > 0 then return Pos_R - Pos_L + 1; else return 0; end if; end Num_Occurrences; -------------------------------------------------------------------- procedure Traverse_For_Lrs(A : in Sa_Type; Pos1, Pos2, Len : out Natural) is procedure Rec_Traverse(Low, High : in Integer) is Mid : Integer; begin if High - Low > 1 then Mid := (Low + High) / 2; Rec_Traverse(Low, Mid); Rec_Traverse(Mid, High); if A(Mid).Lcp > Len then Len := A(Mid).Lcp; Pos2 := A(Mid).Suffix; if A(Mid).Dir = Left then Pos1 := A(High).Suffix; else Pos1 := A(Low).Suffix; end if; end if; end if; end Rec_Traverse; begin Len := 0; Rec_Traverse(A'First-1, A'Last+1); end Traverse_For_Lrs; -------------------------------------------------------------------- procedure Traverse_For_Lcs(A : in Sa_Type; Len_S1 : in Natural; Pos1, Pos2, Len : out Natural) is procedure Rec_Traverse(Low, High : in Integer) is Mid : Integer; begin if High - Low > 1 then Mid := (Low + High) / 2; Rec_Traverse(Low, Mid); Rec_Traverse(Mid, High); if A(Mid).Lcp > Len then if A(Mid).Dir = Left then if A(Mid).Suffix in 1..Len_S1 and A(High).Suffix not in 1..Len_S1 then Len := A(Mid).Lcp; Pos1 := A(Mid).Suffix; Pos2 := A(High).Suffix - Len_S1 - 1; elsif A(Mid).Suffix not in 1..Len_S1 and A(High).Suffix in 1..Len_S1 then Len := A(Mid).Lcp; Pos2 := A(Mid).Suffix - Len_S1 - 1; Pos1 := A(High).Suffix; end if; else if A(Mid).Suffix in 1..Len_S1 and A(Low).Suffix not in 1..Len_S1 then Len := A(Mid).Lcp; Pos1 := A(Mid).Suffix; Pos2 := A(Low).Suffix - Len_S1 - 1; elsif A(Mid).Suffix not in 1..Len_S1 and A(Low).Suffix in 1..Len_S1 then Len := A(Mid).Lcp; Pos2 := A(Mid).Suffix - Len_S1 - 1; Pos1 := A(Low).Suffix; end if; end if; end if; end if; end Rec_Traverse; begin Len := 0; Rec_Traverse(A'First-1, A'Last+1); end Traverse_For_Lcs; end Sa_Standard;