-- package to construct and manipulate suffix arrays -- Authors: Rob Irving, Lorna Love -- Final version: 24 January 2001 -- This version constructs the suffix array by means of -- the refined version of the suffix binary search tree -- represented using dynamically created nodes with Ada.Text_Io; use Ada.Text_Io; with Char_Set_Package; use Char_Set_Package; with Ada.Integer_Text_Io; use Ada.Integer_Text_Io; with Unchecked_Deallocation; package body Sa_Refined is pragma Suppress(All_Checks); type Node_Type; type Tree_Type is access Node_Type; type Node_Type is record Suffix : Natural; Lcp : Natural; Sub : Dirn_Type; Lchild, Rchild : Tree_Type; Lex_Closest, Suff_Link : Tree_Type; end record; procedure Give_Back_Node is new Unchecked_Deallocation(Node_Type, Tree_Type); -------------------------------------------------------------------- function Unused_Char (S : String) return Character is -- returns an ASCII character that does not appear in S, -- giving priority to printable characters; in the case where there -- is no such character, raises the exception No_Dummy_Char_Available Cs : Char_Set_Type; Next : Integer; C : Character; First_Printable : constant := 32; Last_Printable : constant := 127; First_Non_Printable : constant := 0; Last_Non_Printable : constant := 31; begin Cs := Empty_Char_Set; for I in S'range loop Add(S(I), Cs); end loop; Next := First_Printable; while Next <= Last_Printable loop C := Character'Val(Next); if not Is_Member(C, Cs) then return C; else Next := Next + 1; end if; end loop; Next := Last_Non_Printable; while Next >= First_Non_Printable loop C := Character'Val(Next); if not Is_Member(C, Cs) then return C; else Next := Next - 1; end if; end loop; raise No_Dummy_Char_Available; end Unused_Char; --------------------------------------------------------------------- procedure Output_Sa(Sa : in Sa_Type; F_Name : in String) is F : File_Type; begin Create(F, Mode => Out_File, Name => F_Name); for I in Sa'range loop Put(F, Sa(I).Suffix, Width => 3); Put(F, Sa(I).Lcp, Width => 3); if Sa(I).Dir = Left then Put(F, "Left"); else Put(F, "Right"); end if; New_Line(F); end loop; Close(F); end Output_Sa; -------------------------------------------------------------------- procedure Insert_In_Sbst (T : in out Tree_Type; S : in String; J : in Natural; Prev_Node : in Tree_Type; New_Node : out Tree_Type) is -- Inserts the suffix S(J..S'Length) into T, the suffix binary search -- tree for S. Prev_node accesses the previously inserted leaf node -- and New_Node returns an access to the new leaf node created -------------------------------------------------------------------- procedure Branch_Left (T : in out Tree_Type; D : in out Dirn_Type) is begin T := T.LChild; D := Left; end Branch_Left; -------------------------------------------------------------------- procedure Branch_Right (T : in out Tree_Type; D : in out Dirn_Type) is begin T := T.RChild; D := Right; end Branch_Right; -------------------------------------------------------------------- K, M, N : Natural; Llcp, Rlcp : Natural := 0; Dir : Dirn_Type; I, Prev_I, Start_Posn, Closest_Link : Tree_Type; begin if Prev_Node.Lcp <= 1 then -- no useful info, so just start at the root Start_Posn := T; else Start_Posn := Prev_Node.Lex_Closest.Suff_Link; if Start_Posn.Lcp < Prev_Node.Lcp - 1 then -- insert in this subtree Prev_I := Start_Posn; if Prev_Node.Sub = Left then Branch_Left(Start_Posn, Dir); Llcp := Prev_Node.Lcp - 1; else Branch_Right(Start_Posn, Dir); Rlcp := Prev_Node.Lcp - 1; end if; else while Start_Posn.Lcp >= Prev_Node.Lcp - 1 loop Start_Posn := Start_Posn.Lex_Closest; end loop; M := Prev_Node.Lcp - 1; while J + M <= S'Length and then S(J+M) = S(Start_Posn.Suffix+M) loop M := M + 1; end loop; Prev_I := Start_Posn; if J + M > S'Length or else S(J+M) < S(Start_Posn.Suffix+M) then Llcp := M; Branch_Left(Start_Posn, Dir); else Rlcp := M; Branch_Right(Start_Posn, Dir); end if; end if; end if; I := Start_Posn; Closest_Link := Prev_I; M := S'Length - J + 1; while I /= null loop Prev_I := I; if I.Lcp > Integer'Max(Llcp, Rlcp) then if I.Sub = Left then Branch_Left(I, Dir); else Branch_Right(I, Dir); end if; elsif Llcp > Rlcp then if Llcp > I.Lcp then if I.Sub = Left then Rlcp := I.Lcp; if Rlcp >= Llcp then Closest_Link := I; end if; end if; Branch_Right(I, Dir); elsif Llcp = I.Lcp and I.Sub = Right then Branch_Right(I, Dir); end if; elsif Rlcp > Llcp then if Rlcp > I.Lcp then if I.Sub = Right then Llcp := I.Lcp; if Llcp >= Rlcp then Closest_Link := I; end if; end if; Branch_Left(I, Dir); elsif Rlcp = I.Lcp and I.Sub = Left then Branch_Left(I, Dir); end if; end if; if I = Prev_I then -- haven't yet branched, so compare characters K := I.Lcp; N := S'Length - I.Suffix + 1; while K < Integer'Min(M,N) and then S(J+K) = S(I.Suffix+K) loop K := K + 1; end loop; if K = M then Llcp := K; if Llcp >= Rlcp then Closest_Link := I; end if; Branch_Left(I, Dir); elsif K = N then Rlcp := K; if Rlcp >= Llcp then Closest_Link := I; end if; Branch_Right(I, Dir); elsif S(J + K) < S(I.Suffix + K) then Llcp := K; if Llcp >= Rlcp then Closest_Link := I; end if; Branch_Left(I, Dir); else Rlcp := K; if Rlcp >= Llcp then Closest_Link := I; end if; Branch_Right(I, Dir); end if; end if; end loop; if Dir = Left then if Llcp > Rlcp then New_Node := new Node_Type'(J, Llcp, Left, null, null, Closest_Link, null); Prev_I.Lchild := New_Node; else New_Node := new Node_Type'(J, Rlcp, Right, null, null, Closest_Link, null); Prev_I.Lchild := New_Node; end if; elsif Llcp > Rlcp then New_Node := new Node_Type'(J, Llcp, Left, null, null, Closest_Link, null); Prev_I.Rchild := New_Node; else New_Node := new Node_Type'(J, Rlcp, Right, null, null, Closest_Link, null); Prev_I.Rchild := New_Node; end if; Prev_Node.Suff_Link := New_Node; end Insert_In_Sbst; -------------------------------------------------------------------- procedure Build_Sbst (S : in String; T : out Tree_Type) is X, Y : Tree_Type; begin T := new Node_Type'(1, 0, Right, null, null, null, null); Y := T; for K in 2..S'Length loop Insert_In_Sbst (T, S, K, Y, X); Y := X; end loop ; end Build_Sbst; -------------------------------------------------------------------- procedure Destroy_Sbst (T : in out Tree_Type) is procedure Rec_Destroy(T : in out Tree_Type) is begin if T /= null then Rec_Destroy(T.Lchild); Rec_Destroy(T.Rchild); Give_Back_Node(T); end if; end Rec_Destroy; begin Rec_Destroy(T); T := null; end Destroy_Sbst ; -------------------------------------------------------------------- procedure Build_Gen_Sbst (S1, S2 : in String; T : out Tree_Type) is begin Build_Sbst(S1 & Unused_Char(S2) & S2, T); end Build_Gen_Sbst; -------------------------------------------------------------------- procedure Sbst_To_Isa(T : in Tree_Type; A : out Sa_Type) is -- transform suffix binary search tree T to -- 'intermediate' form of suffix array A I : Natural := 0; -- inorder position of current node Pred: Tree_Type; -- inorder predecessor of current node Anc : Tree_Type; -- 'best' ancestor of Pred procedure Traverse(T : in Tree_Type; K : in Tree_Type) is -- traverse the subtree rooted at T; -- K is the 'best' ancestor of T, i.e. T's nearest ancestor such that -- the sub value of K is undefined, or -- the sub value of K is Left and T is in the Right subtree of K, or -- the sub value of K is Right and T is in the Left subtree of K begin if T /= null then if T.Lcp = 0 then Traverse(T.LChild, T); elsif T.Sub = Right then Traverse(T.LChild, T); else Traverse(T.LChild, K); end if; I := I + 1; A(I).Suffix := T.Suffix; if I > 1 then if T.LChild = null then if T.Sub = Right then A(I-1).Lcp := T.Lcp; else A(I-1).Lcp := K.Lcp; end if; else if Pred.Sub = Left then A(I-1).Lcp := Pred.Lcp; else A(I-1).Lcp := Anc.Lcp; end if; end if; end if; Pred := T; Anc := K; if T.Lcp = 0 then Traverse(T.RChild, T); elsif T.Sub = Left then Traverse(T.RChild, T); else Traverse(T.RChild, K); end if; end if; end Traverse; begin Traverse(T, null); A(A'Last).Lcp := 0; end Sbst_To_Isa; -------------------------------------------------------------------- procedure Isa_To_Sa(A : in out Sa_Type) is -- transform 'intermediate' form of -- suffix array A to final form procedure Process(Low, High : in Integer) is Mid, Temp : Integer; begin if High - Low > 1 then Mid := (Low + High) / 2; Process(Low, Mid); Process(Mid, High); if Low = A'First-1 then A(Mid).Dir := Left; elsif High = A'Last+1 or else A(Low).Lcp > A(Mid).Lcp then Temp := A(Low).Lcp; A(Low).Lcp := A(Mid).Lcp; A(Mid).Lcp := Temp; A(Mid).Dir := Right; else A(Mid).Dir := Left; end if; end if; end Process; begin Process(A'First-1, A'Last+1); end Isa_To_Sa; -------------------------------------------------------------- procedure Build_Sa(S : in String; A : out Sa_Type) is T : Tree_Type; begin Build_Sbst(S, T); Sbst_To_Isa(T, A); --Output_Sa(A, "Inter.txt"); Destroy_Sbst(T); Isa_To_SA(A); end Build_Sa; -------------------------------------------------------------------- procedure Build_Gen_Sa (S1, S2 : in String; A : out Sa_Type) is T : Tree_Type; begin Build_Gen_SBST(S1, S2, T); Sbst_To_Isa(T,A); Destroy_Sbst(T); Isa_To_SA(A); end Build_Gen_Sa; -------------------------------------------------------------------- procedure Search_Left_and_Rightmost(A : in Sa_Type; S, X : in String; Pos_L, Pos_R : out Natural) is -- Searches the suffix array A, of string S, for a target -- string X; returns the leftmost position in A of a suffix that has -- X as a prefix, or zero if X is not a substring of S F, Lt, Rt, Mid, Limit, N, K, Pos, Temp_L, Temp_R : Natural; Found : Boolean := False; L, Llcp, Rlcp : Natural := 0; D : Dirn_Type; begin Found := False; Pos := 0; Lt := A'First-1; Rt := A'Last+1; while not Found and Rt - Lt > 1 loop Mid := (Lt + Rt) / 2; D := A(Mid).Dir; L := A(Mid).Lcp; if L > Integer'Max(Llcp, Rlcp) then if D = Left then Rt := Mid; else Lt := Mid; end if; elsif Llcp > Rlcp then if Llcp > L then if D = Left then Rlcp := L; end if; Lt := Mid; elsif Llcp = L and D = Right then Lt := Mid; end if; elsif Rlcp > Llcp then if Rlcp > L then if D = Right then Llcp := L; end if; Rt := Mid; elsif Rlcp = L and D = Left then Rt := Mid; end if; end if; if (Lt /= Mid and Rt /= Mid) then -- need to compare characters F := A(Mid).Suffix; K := L; N := S'Length - F + 1; Limit := Integer'Min(X'Length, N); while K < Limit and then X(K + X'First) = S(F + K) loop K := K + 1; end loop; if K = X'Length then Found := True; Pos := Mid; elsif K = N or else X(K+X'First) > S(F+K) then Rlcp := K; Lt := Mid; else Llcp := K; Rt := Mid; end if; end if; end loop; Pos_L := Pos; Pos_R := Pos; if Pos /= 0 then Temp_L := Mid; Temp_R := Rt; Rt := Mid; while Rt - Lt > 1 loop Mid := (Lt + Rt) / 2; D := A(Mid).Dir; L := A(Mid).Lcp; if L < X'Length or else D = Right then Lt := Mid; else Pos_L := Mid; Rt := Mid; end if; end loop; Rt := Temp_R; Lt := Temp_L; while Rt - Lt > 1 loop Mid := (Lt + Rt) / 2; D := A(Mid).Dir; L := A(Mid).Lcp; if L < X'Length or else D = Left then Rt := Mid; else Pos_R := Mid; Lt := Mid; end if; end loop; end if; end Search_Left_and_Rightmost; -------------------------------------------------------------------- procedure Search_Sa(A : in Sa_Type; S, X : in String; Found : out Boolean; Pos : out Natural) is Pos_L, Pos_R : Natural; begin Search_Left_and_Rightmost(A, S, X, Pos_L, Pos_R); if Pos_L = 0 then Found := False; else Found := True; Pos := A(Pos_L).Suffix; end if; end Search_Sa; -------------------------------------------------------------------- procedure Fully_Search_Sa(A : in Sa_Type; S, X : in String; Pos_List : out Integer_List_Type) is Pos_L, Pos_R : Natural; begin Create_Empty_List(Pos_List); Search_Left_and_Rightmost(A, S, X, Pos_L, Pos_R); if Pos_L > 0 then for I in Pos_L .. Pos_R loop Insert(A(I).Suffix, Pos_List); end loop; end if; end Fully_Search_Sa; -------------------------------------------------------------------- function Num_Occurrences(A : Sa_Type; S, X : String) return Natural is Pos_L, Pos_R : Natural; begin Search_Left_and_Rightmost(A, S, X, Pos_L, Pos_R); if Pos_L > 0 then return Pos_R - Pos_L + 1; else return 0; end if; end Num_Occurrences; -------------------------------------------------------------------- procedure Traverse_For_Lrs(A : in Sa_Type; Pos1, Pos2, Len : out Natural) is procedure Rec_Traverse(Low, High : in Integer) is Mid : Integer; begin if High - Low > 1 then Mid := (Low + High) / 2; Rec_Traverse(Low, Mid); Rec_Traverse(Mid, High); if A(Mid).Lcp > Len then Len := A(Mid).Lcp; Pos2 := A(Mid).Suffix; if A(Mid).Dir = Left then Pos1 := A(High).Suffix; else Pos1 := A(Low).Suffix; end if; end if; end if; end Rec_Traverse; begin Len := 0; Rec_Traverse(A'First-1, A'Last+1); end Traverse_For_Lrs; -------------------------------------------------------------------- procedure Traverse_For_Lcs(A : in Sa_Type; Len_S1 : in Natural; Pos1, Pos2, Len : out Natural) is procedure Rec_Traverse(Low, High : in Integer) is Mid : Integer; begin if High - Low > 1 then Mid := (Low + High) / 2; Rec_Traverse(Low, Mid); Rec_Traverse(Mid, High); if A(Mid).Lcp > Len then if A(Mid).Dir = Left then if A(Mid).Suffix in 1..Len_S1 and A(High).Suffix not in 1..Len_S1 then Len := A(Mid).Lcp; Pos1 := A(Mid).Suffix; Pos2 := A(High).Suffix - Len_S1 - 1; elsif A(Mid).Suffix not in 1..Len_S1 and A(High).Suffix in 1..Len_S1 then Len := A(Mid).Lcp; Pos2 := A(Mid).Suffix - Len_S1 - 1; Pos1 := A(High).Suffix; end if; else if A(Mid).Suffix in 1..Len_S1 and A(Low).Suffix not in 1..Len_S1 then Len := A(Mid).Lcp; Pos1 := A(Mid).Suffix; Pos2 := A(Low).Suffix - Len_S1 - 1; elsif A(Mid).Suffix not in 1..Len_S1 and A(Low).Suffix in 1..Len_S1 then Len := A(Mid).Lcp; Pos2 := A(Mid).Suffix - Len_S1 - 1; Pos1 := A(Low).Suffix; end if; end if; end if; end if; end Rec_Traverse; begin Len := 0; Rec_Traverse(A'First-1, A'Last+1); end Traverse_For_Lcs; end Sa_Refined;