flex lexer分析

flex会根据你定义的正则表达式匹配到相应字段，然后根据你定义的函数进行操作，返回相应的token。

为了了解flex如何work的，我们新建一个空的flex规则文件null.flex：

%%

然后运行命令flex null.flex生成lex.yy.c文件。

接下来逐行分析它的code：

首先是一些flex的版本相关的宏：

#define FLEX_SCANNER
#define YY_FLEX_MAJOR_VERSION 2
#define YY_FLEX_MINOR_VERSION 6
#define YY_FLEX_SUBMINOR_VERSION 0
#if YY_FLEX_SUBMINOR_VERSION > 0
#define FLEX_BETA
#endif

然后将一些type define成自己的格式：

#ifndef __STDC_LIMIT_MACROS
#define __STDC_LIMIT_MACROS 1
#endif
#include <inttypes.h>
typedef int8_t flex_int8_t;
typedef uint8_t flex_uint8_t;
typedef int16_t flex_int16_t;
typedef uint16_t flex_uint16_t;
typedef int32_t flex_int32_t;
typedef uint32_t flex_uint32_t;
#else
typedef signed char flex_int8_t;
typedef short int flex_int16_t;
typedef int flex_int32_t;
typedef unsigned char flex_uint8_t;
typedef unsigned short int flex_uint16_t;
typedef unsigned int flex_uint32_t;
#ifndef INT8_MIN
#define INT8_MIN               (-128)
#endif#ifndef INT16_MIN
#define INT16_MIN              (-32767-1)
#endif
#ifndef INT32_MIN
#define INT32_MIN              (-2147483647-1)
#endif
#ifndef INT8_MAX
#define INT8_MAX               (127)
#endif
#ifndef INT16_MAX
#define INT16_MAX              (32767)
#endif
#ifndef INT32_MAX
#define INT32_MAX              (2147483647)
#endif#ifndef UINT8_MAX
#define UINT8_MAX              (255U)
#endif
#ifndef UINT16_MAX
#define UINT16_MAX             (65535U)
#endif
#ifndef UINT32_MAX
#define UINT32_MAX             (4294967295U)
#endif

然后定义EOF的宏:

1	#define YY_NULL 0

将signed char安全转换为unsigned int的宏YY_SC_TO_UI：

1	#define YY_SC_TO_UI(c) ((unsigned int) (unsigned char) c)

start condition中用的BEGIN():

1	#define BEGIN (yy_start) = 1 + 2 *

在后面的代码可以看出，这些start condition定义的关键词都被定义成了宏，比如我们有两个start condition COMMENT和STRING：

1	#define INITIAL 0#define COMMENT 1#define STRING 2

这时BEGIN(COMMENT)就等价于：

1	(yy_start) = 1 + 2 * COMMENT

之后就可以得到相应的YY_STATE：

1	#define YY_START (((yy_start) - 1) / 2)#define YYSTATE YY_START

略去其他不重要的宏，接下来看两个FILE指针yyin和yyout，可以看出yyin指向了stdin，yyout指向了stdout，分别对应了标准输入输出流：

extern FILE *yyin, *yyout;
/*a lot of code*/
#ifdef YY_STDINIT    
	yyin = stdin;
	yyout = stdout;
#else    
	yyin = (FILE *) 0;
	yyout = (FILE *) 0;
#endif

接下来的代码就可以看出，flex的lexer是使用自顶向下的表驱动预测分析法来实现匹配的，关于这里面的预测分析，LL(1)文法等定义见编译原理的相关教材。

首先要实现表驱动预测分析，就要定义读入text的方法，理论上是一个个character读入，但为了效率起见，flex将它们批量读取存入了buffer中：

#ifndef YY_INPUT
#define YY_INPUT(buf,result,max_size) \
	if ( YY_CURRENT_BUFFER_LVALUE->yy_is_interactive ) \
    { \
        int c = '*'; \
        size_t n; \
        for ( n = 0; n < max_size && \
                (c = getc( yyin )) != EOF && c != '\n'; ++n ) \
            buf[n] = (char) c; \
        if ( c == '\n' ) \
            buf[n++] = (char) c; \
        if ( c == EOF && ferror( yyin ) ) \
            YY_FATAL_ERROR( "input in flex scanner failed" ); \
        result = n; \
    } \
    else \
    { \
    	errno=0; \
        while ( (result = fread(buf, 1, max_size, yyin))==0 && ferror(yyin)) \
        { \
        	if( errno != EINTR) \
            { \
            	YY_FATAL_ERROR( "input in flex scanner failed" ); \
                break; \
            } \
            errno=0; \
            clearerr(yyin); \
         } \
      }\
\
 #endif

这里就要吐槽一下它的缩进了，简直惨不忍睹。如果你觉得这种读取方法很复杂的话，可以rewrite自己的YY_INPUT：

#undef YY_INPUT
#define YY_INPUT(buf,result,max_size) \
    if ( (result = fread( (char*)buf, sizeof(char), max_size, fin)) < 0) \
    	YY_FATAL_ERROR( "read() in flex scanner failed");

这段代码引自参考文献[1]。

为什么说它用的是表驱动法呢？从它在静态区cache了一些预测分析表的常量：

static yyconst flex_int16_t yy_accept[6] = {...};
static yyconst YY_CHAR yy_ec[256] = {...};
static yyconst YY_CHAR yy_meta[2] = {...};
static yyconst flex_uint16_t yy_base[7] = {...};
static yyconst flex_int16_t yy_def[7] = {...};
static yyconst flex_uint16_t yy_nxt[5] = {...};
static yyconst flex_int16_t yy_chk[5] = {...};

以及定义了将缓冲区变量压栈和出栈的一系列操作：

void yypush_buffer_state (YY_BUFFER_STATE new_buffer )
{/*a lof of code*/}
void yypop_buffer_state (void)
{/*a lof of code*/}
static void yyensure_buffer_stack (void)
{/*a lof of code*/}

再对比下自顶向下表驱动法的算法结构：

就一目了然了。

接下来就来看这个算法flex具体是如何实现的，它过程就在于yylex()这个函数，大致的框架如下：

#define YY_DECL int yylex (void)
/*a lot of code*/
YY_DECL
{    // initialize
    // crate stack
    // load buffer
    while(1)
    {
        yy_match:
        /* 在这里不断进行状态转移，直至无法继续转移 */
        /* 用到了前面提的YY_SC_TO_UI*/
        do{
            YY_CHAR yy_c = yy_ec[YY_SC_TO_UI(*yy_cp)] ;
            if ( yy_accept[yy_current_state] )
            {
                (yy_last_accepting_state) = yy_current_state;
                (yy_last_accepting_cpos) = yy_cp;
            }
            while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state )
            {
                yy_current_state = (int) yy_def[yy_current_state];
                if ( yy_current_state >= 6 )
                    yy_c = yy_meta[(unsigned int) yy_c];
            }				
            yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c];
            ++yy_cp;
        }			
        while ( yy_base[yy_current_state] != 3 );
        yy_find_action:
        /*根据yy_current_state返回对应的yy_act*/
        /*检查是否停止在接受状态(yy_act==0的情况)*/
        /*yy_act == 1 表示accept*/
        /*具体见do_action*/
        yy_act = yy_accept[yy_current_state];
        if ( yy_act == 0 )
        { 
            /* have to back up */
            yy_cp = (yy_last_accepting_cpos);
            yy_current_state = (yy_last_accepting_state);
            yy_act = yy_accept[yy_current_state];
        }
        YY_DO_BEFORE_ACTION;
        do_action:
        /*处理各种yy_act*/
        switch(yy_act){
            case 0: /*读到EOF*/
            case 1: /*accept*/
        }
    }
}

这里的do_action中，yy_act只有0和1两种情况，即读到非EOF的直接accept，读到EOF转case 0。因为这是一个空的规则文件，假如我们用带有规则的xxx.flex生成code，这些规则中的正则表达式匹配方法会表现到静态区中cache的预测分析表中，匹配到的action会体现在这个switch-case语句中。

为了验证以上的说明，我们定义一个用来匹配能够解释为十进制数字符串的规则,放到文件test.flex：

WHITE   " "|\t|\f|\r|\v
%%
{WHITE}*[+-]?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))([Ee][+-]?[0-9]+)?{WHITE}* {return true;}
. {return false;}
%%

对应的DFA图如下：

这个例子来自参考文献[1].

flex为了减少内存开销，对原来的状态表进行了压缩，因此在空规则生成的文件中，我们能看到很多个静态区常量(见上面)，为了看到原本的状态表，我们使用 -Cf进行编译：flex -Cf test.flex ,然后可以看到：

1 2	static yyconst flex_int16_t yy_nxt[][128] = {...}; static yyconst flex_int16_t yy_accept[9] = {...};

其中flex_int16_t yy_nxt[][128]就是原来的状态表。在yylex()中，匹配的过程是这么写的：

yy_match:		
	while ( (yy_current_state = yy_nxt[yy_current_state][ YY_SC_TO_UI(*yy_cp) ]) > 0 )
    {
        if ( yy_accept[yy_current_state] )
        {
            (yy_last_accepting_state) = yy_current_state;
            (yy_last_accepting_cpos) = yy_cp;
        }
        ++yy_cp;
    }
	yy_current_state = -yy_current_state;
yy_find_action:
    yy_act = yy_accept[yy_current_state];
    YY_DO_BEFORE_ACTION;

现在我们只要照着它稍微修改一下，就能得到一个判断字符串是否能作为数字的程序了：

#define YY_SC_TO_UI(c) ((unsigned int) (unsigned char) c)
#define yyconst const
typedef int flex_int16_t;
typedef int flex_int32_t;
typedef unsigned char YY_CHAR;
static yyconst flex_int16_t yy_nxt[][128] = {...};
static yyconst flex_int16_t yy_accept[21] = {...};
bool isNumber(string s) 
{    
    int yy_current_state = 1;
    for ( int i = 0; i < s.length(); ++i ) {
        yy_current_state = yy_nxt[yy_current_state][YY_SC_TO_UI(s[i])];
        if ( yy_current_state < 0 ) 
            return false;
    }    
    return yy_accept[yy_current_state] == 1;
}

这里只需要判断新的current state是否大于0.没用必要查询是否accept(因为match时小于0直接跳出循环，说明匹配失败)。

对于压缩版本的，也可以有如下的对应。

yylex()：

yy_match:
    do			
    {
        YY_CHAR yy_c = yy_ec[YY_SC_TO_UI(*yy_cp)] ;
        if ( yy_accept[yy_current_state] )
        {
            (yy_last_accepting_state) = yy_current_state;
            (yy_last_accepting_cpos) = yy_cp;
        }
        while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state )
        {
            yy_current_state = (int) yy_def[yy_current_state];
            if ( yy_current_state >= 22 )
                yy_c = yy_meta[(unsigned int) yy_c];
        }
        yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c];
        ++yy_cp;
    }		
    while ( yy_base[yy_current_state] != 43 );
yy_find_action:
    yy_act = yy_accept[yy_current_state];
    if ( yy_act == 0 )
    { 
        /* have to back up */
        yy_cp = (yy_last_accepting_cpos);
        yy_current_state = (yy_last_accepting_state);
        yy_act = yy_accept[yy_current_state];
    }		
    YY_DO_BEFORE_ACTION;

valid_number程序：

#define YY_SC_TO_UI(c) ((unsigned int) (unsigned char) c)
#define yyconst consttypedef int flex_int16_t;
typedef int flex_int32_t;
typedef unsigned char YY_CHAR;
static yyconst flex_int16_t yy_accept[6] = {...};static yyconst YY_CHAR yy_ec[256] = {...};
static yyconst YY_CHAR yy_meta[2] = {...};static yyconst flex_uint16_t yy_base[7] = {...};
static yyconst flex_int16_t yy_def[7] = {...};
static yyconst flex_uint16_t yy_nxt[5] = {...};
static yyconst flex_int16_t yy_chk[5] = {...};
bool isNumber(string s){
    int yy_current_state = 1;
    for(int i = 0; i < s.length(); ++i){
        YY_CHAR yy_c = yy_ec[YY_SC_TO_UI(*yy_cp)] ;
        while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state )
        {
            yy_current_state = (int) yy_def[yy_current_state];
            if ( yy_current_state >= 22 )
                yy_c = yy_meta[(unsigned int) yy_c];
        }
        yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c];
        return(yy_accept[yy_current_state]);
}

这里需要判断current state是否能被accept(即返回1)，当然根据while循环条件也可以。

References：

[1].Flex技巧总结 && [LeetCode]Valid Number题解 https://blog.finaltheory.me/research/Flex-Tricks.html