#ifndef RXH
#define RXH

/*	Copyright (C) 1992 Free Software Foundation, Inc.

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this software; see the file COPYING.  If not, write to
the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.  */
/*  t. lord	Wed Sep 23 18:20:57 1992	*/

#include <sys/types.h>
#include "regex.h"
#include "bitset.h"
#include "sp.h"


/* RXDEBUG defaults to 1 unless whoever includes this header has already
 * defined it.  When it is non-zero, <stdio.h> is pulled in because the
 * debugging declarations further down (print_rexp, print_nfa and the
 * side_effect_printer typedef) mention FILE *.
 */
#ifndef RXDEBUG
#define RXDEBUG 1
#endif
#if RXDEBUG
#include <stdio.h>
#endif



/* Struct RX holds a compiled regular expression - that is, an nfa ready to be
 * converted on demand to a dfa.  Most functions declared here take a
 * `struct rx *' as the first argument.
 */

struct rx
{
  /* NOTE(review): not described in this header; presumably a count of
   * outstanding locks on this rx -- confirm against its users.
   */
  int locks;

  /* Every regex defines the size of its own character set. */
  int local_cset_size;

  void * buffer;		/* Malloced memory for the nfa. */
  unsigned long allocated;	/* Size of that memory. */

  /* How much buffer space to save for external uses.  After compilation,
   * this space will be available at (buffer + allocated - reserved)
   */
  unsigned long reserved; 

  /* --------- The remaining fields are for internal use only. --------- */
  /* --------- But! they should be initialized to 0.	       --------- */
  /* NODEC is the number of nodes in the NFA with non-epsilon
   * out transitions. 
   */
  int nodec;

  /* EPSNODEC is the number of nodes with only epsilon (out) transitions. */
  int epsnodec;

  /* The sum of NODEC & EPSNODEC is the total number of states in the
   * compiled NFA.
   */

  /* SE_MEMO temporarily holds a tree of side effect lists.
   * NOTE(review): this comment used to call the field `side_effect_progs';
   * presumably it has always described se_memo -- confirm.
   */
  struct sp_tree * se_memo;

  /* A memo for sets of states in the possible_future lists of an nfa: */
  struct sp_tree * nfa_set_memo;

  /* The root of the memo of superstates during evaluation. */
  struct superstate_contents * nil_set;

  /* The queue of superstates during evaluation. (see rxrun.h) */
  struct superstate * min_free_superstate; /* ->next_recyclable leads to. */
  struct superstate * max_free_superstate; /* ->next_... is `max_unfree' */

  /* NOTE(review): only SUPERSTATE_COUNT is described here ("for control
   * of the cache"); by their names the other three are presumably
   * counters for the same superstate cache -- confirm in the matcher.
   */
  int superstate_count;		/* for control of the cache. */
  int superstate_semifree;
  int superstate_hits;
  int superstate_misses;

  /* The instruction table is indexed by the enum of instructions defined in 
   * rxrun.h.  The values in the table are used to fill in the `inx'
   * slot of instruction frames (see rxrun.h).
   */
  void ** instruction_table;

  /* NOTE(review): presumably the list of this rx's NFA states, chained
   * through the NEXT field of struct nfa_state -- confirm.
   */
  struct nfa_state *nfa_states;
  struct nfa_state *start;	/* For rxcompat.c */
};

/* These functions manipulate bitsets of size RX->LOCAL_CSET_SIZE.
 *
 * NOTE(review): the definitions are not in this header; going by the
 * names, CSET presumably makes a new bitset for RX, FREE_CSET releases
 * one, and COPY_CSET duplicates its second argument -- confirm.
 */
extern ut_Bitset cset P_ ((struct rx *));
extern void free_cset P_ ((ut_Bitset));
extern ut_Bitset copy_cset P_ ((struct rx *, ut_Bitset));


/* An RX NFA may contain epsilon edges labeled with side effects.
 * These side effects represent match actions that can not normally be
 * defined in a `pure' NFA; for example, recording the location at
 * which a paren is crossed in a register structure.  For the rx
 * library, these side effects are opaque values (supporting only ==).
 * Semantically, a matcher is supposed to find a particular path
 * through the NFA (such as leftmost-longest), and then to execute the
 * side effects along that path.  Operationally, the space of paths is
 * searched and side effects are carried out incrementally, and with
 * backtracking.
 *
 * As the NFA is converted to a DFA, we move from states to sets of
 * states, and from side effects to sets of side effects.  Simple
 * lists are used to hold side effect lists.
 */

/* A side effect is an opaque value as far as the rx library is
 * concerned; the only operation the library relies on is ==.
 */
typedef void * rx_side_effect;

/* A simple singly linked list of side effects: CAR is one side effect,
 * CDR is the rest of the list.
 */
struct rx_side_effect_list 
{
  rx_side_effect car;
  struct rx_side_effect_list * cdr;
};



/* Struct rexp_node holds an expression tree that represents a regexp.
 * In this expression tree, every node has a type, and some parameters
 * appropriate to that type.
 */

enum rexp_node_type
{
  /* Match from a character set.  `a' or `[a-z]' */
  r_cset,

  /* Concat two regexps.  `ab' */
  r_concat,

  /* Choose one of two regexps.  `a\|b' */
  r_alternate,

  /* Optional regexp.  `a?' */
  r_opt,

  /* Repeated regexp.  `a*' */
  r_star,

  /* Matches the empty string, but implies that a side effect must
   * take place.  These nodes are used by the parser to implement
   * parens, backreferences etc.
   */
  r_side_effect,

  /* R_DATA is solely for the convenience of parsers or other rexp
   * transformers that want to (temporarily) introduce new node types
   * in rexp structures.  These must be eliminated by the time
   * build_nfa is called.
   */
  r_data
};

/* One node of a regexp expression tree.  TYPE says what kind of node
 * this is; PARAMS carries the type-specific data.
 *
 * NOTE(review): which union member goes with which type is not stated
 * here.  Going by the names, presumably: CSET for r_cset, SIDE_EFFECT
 * for r_side_effect, PAIR for the operator nodes (r_concat,
 * r_alternate, r_opt, r_star) and DATA for r_data -- confirm against
 * build_nfa.
 */
struct rexp_node
{
  enum rexp_node_type type;
  union
  {
    ut_Bitset cset;
    rx_side_effect side_effect;
    struct
      {
	struct rexp_node *left;
	struct rexp_node *right;
      } pair;
    void * data;
  } params;
};

/* Functions on rexp trees.
 *
 * NOTE(review): the definitions are not in this header; going by the
 * names and signatures, REXP_NODE presumably makes a node of the given
 * TYPE, COPY_REXP duplicates a tree and FREE_REXP releases one --
 * confirm, including who owns the nodes afterwards.
 */
extern struct rexp_node * rexp_node P_((struct rx *rx,
					enum rexp_node_type type));
extern struct rexp_node * copy_rexp P_((struct rx *, struct rexp_node *));
extern void free_rexp P_((struct rexp_node *));

/* Debugging aids, declared only when RXDEBUG is non-zero (which is also
 * when <stdio.h> is included for FILE).
 *
 * A SIDE_EFFECT_PRINTER is a callback taking an rx, one side effect and
 * a stdio stream; since side effects are opaque to the library, it is
 * presumably how callers supply the way to display them -- confirm.
 */
#if RXDEBUG
typedef void (*side_effect_printer) P_((struct rx *,
					rx_side_effect, FILE *));
extern void print_rexp P_((struct rx *, struct rexp_node *,
			   int indent_depth, side_effect_printer, FILE * fp));
#endif



/* This defines the structure of the NFA into which rexps are compiled. */

/* One state of the NFA. */
struct nfa_state
{
  /* NOTE(review): presumably the (int) name that ECLOSE_NFA assigns --
   * negative for states with only epsilon transitions, non-negative
   * otherwise -- confirm.
   */
  int id;		
  struct nfa_edge *edges;	/* Edges FROM this state. */
  struct possible_future *futures; /* see ECLOSE_NFA */
  unsigned int is_final:1;
  unsigned int is_start:1;

  unsigned int mark;          /* For the convenience of graph algorithms. */

  /* NOTE(review): presumably chains the states belonging to one rx
   * (see the NFA_STATES field of struct rx) -- confirm.
   */
  struct nfa_state *next;
};

/* The kinds of edge found in an NFA; see the TYPE field of
 * struct nfa_edge.
 *
 * There is deliberately no comma after the last enumerator: a trailing
 * comma in an enumerator list is only valid from C99 on, and this
 * header is otherwise written to be usable with older compilers.
 */
enum nfa_edge_type
{
  ne_cset,			/* An edge labeled with a character set. */
  ne_epsilon,			/* A plain epsilon edge. */
  ne_side_effect		/* A special kind of epsilon. */
};

/* One edge of the NFA, leading to the state DEST.
 *
 * NOTE(review): NEXT presumably links the edges that leave the same
 * state (see the EDGES field of struct nfa_state), and PARAMS
 * presumably holds CSET for ne_cset edges and SIDE_EFFECT for
 * ne_side_effect edges -- confirm.
 */
struct nfa_edge
{
  struct nfa_edge *next;
  enum nfa_edge_type type;
  struct nfa_state *dest;
  union
  {
    ut_Bitset cset;
    rx_side_effect side_effect;
  } params;
};

/* When a simple NFA is converted to a DFA, one step is to compute the
 * epsilon closure of each NFA state.  In RX, the situation is
 * slightly complicated by the presence of side effects.  
 *
 * Side effects are recorded on epsilon edges.  Therefore, a given
 * epsilon-only path through the NFA may encounter certain side
 * effects.  During a match, we will need to know specifically which 
 * side effects are reached.  Therefore, rather than computing a
 * maximal epsilon closure for every node, RX computes multiple,
 * non-maximal epsilon closures.  Individually, these submaximal
 * closures are called POSSIBLE_FUTURES.
 *
 * All the epsilon paths compressed into a particular POSSIBLE_FUTURE
 * encounter the same set of side effects.  No two POSSIBLE_FUTUREs
 * (for a given NFA state) involve the same side effects.
 */

/* A set of NFA states, held as a singly linked list: CAR is one member,
 * CDR is the rest of the list.
 */
struct nfa_state_set
{
  struct nfa_state * car;
  struct nfa_state_set * cdr;
};
/* NOTE(review): the definition is not in this header; going by the
 * name, this presumably returns the union of SETA and SETB -- confirm,
 * including whether the result is shared through RX's nfa_set_memo.
 */
extern struct nfa_state_set *
  nfa_set_union P_((struct rx * rx,
		    struct nfa_state_set * seta,
		    struct nfa_state_set * setb));

/* One POSSIBLE_FUTURE of an NFA state: a list of side effects (EFFECTS)
 * paired with a set of states (DESTSET).  NEXT links the possible
 * futures recorded for a state (see the FUTURES field of
 * struct nfa_state).
 *
 * NOTE(review): DESTSET is presumably the set of states reachable by
 * the epsilon paths that encounter exactly EFFECTS -- confirm in
 * eclose_nfa.
 */
struct possible_future
{
  struct possible_future *next;
  struct rx_side_effect_list * effects;
  struct nfa_state_set * destset;
};

/* These low level functions are used to build an nfa attached to a specific
 * struct rx.
 */
/* NOTE(review): the definitions are not in this header.  Going by the
 * names and signatures, presumably: NFA_STATE adds a new state to RX,
 * ID_TO_NFA_STATE looks a state up by its int name, NFA_EDGE adds an
 * edge of the given TYPE from START to DEST, and FREE_NFA releases the
 * nfa attached to RX -- confirm, including what is returned on failure.
 */
extern struct nfa_state * nfa_state P_((struct rx * rx));
extern struct nfa_state * id_to_nfa_state P_((struct rx *, int));
extern struct nfa_edge * nfa_edge P_((struct rx * rx,
				      enum nfa_edge_type type,
				      struct nfa_state * start,
				      struct nfa_state * dest));
extern void free_nfa P_((struct rx *rx));


/*
 * BUILD_NFA: This is, more or less, the `Thompson construction'
 *  	of an NFA from a regex.  In its output, each edge is to
 *	exactly one state and some edges are epsilons.
 */
/* EXPRESSION must no longer contain r_data nodes by the time this is
 * called (see enum rexp_node_type).
 *
 * NOTE(review): START and END are pointers to state pointers,
 * presumably used to hand back the first and last state of the
 * constructed NFA; the meaning of the int result is not stated in this
 * header -- confirm both.
 */
extern int build_nfa P_((struct rx * rx,
			 struct rexp_node * expression,
			 struct nfa_state ** start,
			 struct nfa_state ** end));

/* ECLOSE_NFA: This stage prepares the NFA for lazy conversion into a
 * 	DFA by the match engine.  There are several steps:
 *
 *	- NFA states are assigned (int) names.  States with only epsilon 
 * 	  transitions are given negative names.  Others non-neg.
 *
 *	- The epsilon closures (possible futures) of every state are
 *	  computed.
 *
 *  	- Finally, epsilon transitions are deleted from the NFA.
 *
 * NOTE(review): the meaning of the int result is not stated in this
 * header -- confirm.
 */
extern int eclose_nfa P_((struct rx *rx));

/* COMPACTIFY_NFA: This simply copies the entire nfa into a contiguous
 * 	region of memory.  This is done because the GNU regex
 * 	interface requires it, but it gives the advantage that NFA
 * 	states can be looked up with an array reference.
 */
/* NOTE(review): MEM and SIZE are pointers, presumably used to pass the
 * contiguous region and its size (possibly in both directions); the
 * meaning of the int result is not stated in this header -- confirm.
 */
extern int compactify_nfa P_((struct rx *rx, void **mem, unsigned long *size));


/* Debugging aid, declared only when RXDEBUG is non-zero.
 *
 * NOTE(review): presumably prints the NFA reachable from the given
 * state to the given stream, using the side_effect_printer callback to
 * display side effects -- confirm.
 */
#if RXDEBUG
extern void print_nfa P_ ((struct rx *, struct nfa_state *,
			   side_effect_printer, FILE *));
#endif

#endif

