[1] Using LLVM to implement a simple language

Posted by zoozle on Sun, 06 Mar 2022 14:46:27 +0100

This article follows the official LLVM tutorial and adds some commentary along the way. The code in this article is not meant to reflect engineering best practice.

1 objectives

A simple language Kaleidoscope is implemented with LLVM to compile and run the following codes:

# Fibonacci sequence function definition
def fib(x)
    if x < 3 then
        1
    else
        fib(x - 1) + fib(x - 2)

fib(40)

# Function declaration
extern sin(arg)
extern cos(arg)
extern atan2(arg1 arg2)

# The declared function can be called
atan2(sin(.4), cos(42))

For convenience, Kaleidoscope only supports the float64 data type.

2 lexical analysis Lexer

Lexical analysis is the process of splitting the program text into tokens. Tokens include keywords such as if, else and int; identifiers such as age; operators such as +, - and =; and numeric and string literals.

2.1 Token type

// Token codes produced by the lexer. A character the lexer does not recognise
// is returned as its own value in [0-255]; everything it does recognise is
// reported as one of the negative codes below.
enum Token {
  TOKEN_EOF = -1,         // end of input

  TOKEN_DEF = -2,         // command: 'def'
  TOKEN_EXTERN = -3,      // command: 'extern'

  TOKEN_IDENTIFIER = -4,  // primary: identifier
  TOKEN_NUMBER = -5,      // primary: numeric literal
};

// Spelling of the identifier most recently scanned by the lexer. It is the
// value to read when the current token is TOKEN_IDENTIFIER.
static std::string g_identifier_str; 

// Value of the numeric literal most recently scanned by the lexer. It is the
// value to read when the current token is TOKEN_NUMBER.
static double g_num_val;             

2.2 Token identification

///Returns the token parsed from the standard input
// TOKEN_EOF: 'def'
// TOKEN_EXTERN: 'extern'
// TOKEN_IDENTIFIER: [a-zA-Z][a-zA-Z0-9]*
// TOKEN_NUMBER: [0-9.]+
static int GetToken() {
  static int last_char = ' ';

  // Skip spaces
  while (isspace(last_char))
    last_char = getchar();

  // Identify keywords or variables [a-zA-Z][a-zA-Z0-9]*
  if (isalpha(last_char)) { 
    g_identifier_str = last_char;
    while (isalnum((last_char = getchar())))
      g_identifier_str += last_char;

    if (g_identifier_str == "def")
      return TOEKN_DEF;
    if (g_identifier_str == "extern")
      return TOKEN_EXTERN;
    return TOKEN_IDENTIFIER;
  }

  // Identification number: [0-9.]+
  if (isdigit(last_char) || last_char == '.') { 
    std::string num_str;
    do {
      num_str += last_char;
      last_char = getchar();
    } while (isdigit(last_char) || last_char == '.');

    g_num_val = strtod(num_str.c_str(), nullptr);
    return TOKEN_NUMBER;
  }

	// Skip comments
  if (last_char == '#') {
    // Comment until end of line.
    do
      last_char = getchar();
    while (last_char != EOF && last_char != '\n' && last_char != '\r');

    if (last_char != EOF)
      return GetToken();
  }

  // End of identification file
  if (last_char == EOF)
    return TOEKN_EOF;

  // Unknown, return ascii code directly
  int this_char = last_char;
  last_char = getchar();
  return this_char;
}

3 parsing Parser

Syntax analysis builds on lexical analysis to recognize the grammatical structure of the program. This structure is a tree, called the Abstract Syntax Tree (AST). For example, once an expression has been turned into a tree, a computer can easily evaluate it by traversing the tree and computing the value of the root node.

3.1 define AST nodes

/// Common base class of every expression node in the AST.
class ExprAST {
public:
  // Virtual so that a derived node is destroyed correctly through a base
  // pointer.
  virtual ~ExprAST() = default;
};

/// Expression node holding a numeric literal.
class NumberExprAST : public ExprAST {
public:
  /// @param val value of the literal
  NumberExprAST(double val) : val_{val} {}

private:
  double val_;  // the literal's value
};

/// Expression node that refers to a variable by name.
class VariableExprAST : public ExprAST {
public:
  /// @param name name of the referenced variable (copied into the node)
  VariableExprAST(const std::string &name) : name_{name} {}

private:
  std::string name_;  // the variable's name
};

/// Expression node for a binary operation such as `a + b`.
class BinaryExprAST : public ExprAST {
  char opcode_;                     // operator character, e.g. '+'
  std::unique_ptr<ExprAST> lhs_;    // left operand (owned)
  std::unique_ptr<ExprAST> rhs_;    // right operand (owned)

public:
  /// @param opcode operator character
  /// @param lhs    left operand; ownership is transferred to this node
  /// @param rhs    right operand; ownership is transferred to this node
  BinaryExprAST(char opcode, std::unique_ptr<ExprAST> lhs,
                std::unique_ptr<ExprAST> rhs)
    : opcode_(opcode), lhs_(std::move(lhs)), rhs_(std::move(rhs)) {}
};

/// Expression node for a function call: the callee's name plus the argument
/// expressions.
class CallExprAST : public ExprAST {
public:
  /// @param callee name of the function being called (copied into the node)
  /// @param args   argument expressions; ownership is transferred to this node
  CallExprAST(const std::string &callee,
              std::vector<std::unique_ptr<ExprAST>> args)
      : callee_{callee}, args_(std::move(args)) {}

private:
  std::string callee_;                          // name of the called function
  std::vector<std::unique_ptr<ExprAST>> args_;  // argument expressions (owned)
};

/// The interface of a function: its name and the names of its parameters.
class PrototypeAST {
public:
  /// @param name function name (copied into the node)
  /// @param args parameter names; moved into the node
  PrototypeAST(const std::string &name, std::vector<std::string> args)
      : name_{name}, args_(std::move(args)) {}

  /// @return the function name
  const std::string &name() const { return name_; }

private:
  std::string name_;               // function name
  std::vector<std::string> args_;  // parameter names
};

/// A complete function definition: a prototype together with the expression
/// that forms its body.
class FunctionAST {
public:
  /// @param proto the function's prototype; ownership is transferred
  /// @param body  the function's body expression; ownership is transferred
  FunctionAST(std::unique_ptr<PrototypeAST> proto,
              std::unique_ptr<ExprAST> body)
      : proto_{std::move(proto)}, body_{std::move(body)} {}

private:
  std::unique_ptr<PrototypeAST> proto_;  // prototype (owned)
  std::unique_ptr<ExprAST> body_;        // body expression (owned)
};

Conditional expressions are discussed in later chapters.

3.2 analytical basic expression

For convenience, define a helper function:

// The token the parser is currently looking at.
static int g_cur_token;

// Fetches the next token from the lexer, stores it in g_cur_token and
// returns it.
static int NextToken() {
  g_cur_token = GetToken();
  return g_cur_token;
}

Parse basic expression:

/// numberexpr ::= number
// Builds a NumberExprAST from the numeric literal the lexer has just scanned.
// Intended to be called when the current token is TOKEN_NUMBER: the value is
// taken from g_num_val, then the lexer is advanced to the following token.
static std::unique_ptr<ExprAST> ParseNumberExpr() {
  std::unique_ptr<ExprAST> literal = std::make_unique<NumberExprAST>(g_num_val);
  NextToken();  // step past the number
  return literal;
}

/// parenexpr ::= '(' expression ')'
// Parses a parenthesised expression and returns the inner expression's node.
// Intended to be called when the current token is '('. Returns nullptr if the
// inner expression fails to parse, and the result of LogError if the closing
// ')' is missing.
static std::unique_ptr<ExprAST> ParseParenExpr() {
  NextToken();  // step past '('

  auto inner = ParseExpression();
  if (inner == nullptr)
    return nullptr;

  if (g_cur_token == ')') {
    NextToken();  // step past ')'
    return inner;
  }
  return LogError("expected ')'");
}

/// identifierexpr
///   ::= identifier
///   ::= identifier '(' expression* ')'
// Analyze variables and construct VariableExprAST; Or parse function calls and construct CallExprAST
// If the current token is toe_ IDENTIFIER,
// A token will be read in advance to determine whether the current identifier is a variable or a function call according to whether it is' ('
static std::unique_ptr<ExprAST> ParseIdentifierExpr() {
  std::string id_name = g_identifier_str;

  NextToken();  // eat identifier.

  if (g_cur_token != '(') // Simple variable ref.
    return std::make_unique<VariableExprAST>(id_name);

  // Call.
  NextToken();  // eat (
  std::vector<std::unique_ptr<ExprAST>> args;
  if (g_cur_token != ')') {
    while (1) {
      if (auto arg = ParseExpression())
        args.push_back(std::move(arg));
      else
        return nullptr;

      if (g_cur_token == ')')
        break;

      if (g_cur_token != ',')
        return LogError("Expected ')' or ',' in argument list");
      NextToken();
    }
  }

  // Eat the ')'.
  NextToken();

  return std::make_unique<CallExprAST>(id_name, std::move(args));
}

/// primary
///   ::= identifierexpr
///   ::= numberexpr
///   ::= parenexpr
// Parses a primary expression by dispatching on the current token. Any token
// that cannot start a primary expression is reported through LogError, whose
// result is returned.
static std::unique_ptr<ExprAST> ParsePrimary() {
  switch (g_cur_token) {
  default:
    return LogError("unknown token when expecting an expression");
  case TOKEN_IDENTIFIER:
    return ParseIdentifierExpr();
  case TOKEN_NUMBER:
    return ParseNumberExpr();
  case '(':
    return ParseParenExpr();
  }
}

3.3 analytic binary expression

Next, analyze binary expressions. Kaleidoscope only supports four binary operators, and the priority from low to high is:

< + - *

Where, + and - have the same priority.

Define priority:

/// BinopPrecedence - Precedence of each defined binary operator, keyed by the
/// operator character. Larger values bind more tightly.
static std::map<char, int> g_binop_precedence;

/// GetTokenPrecedence - Get the precedence of the pending binary operator
/// token.
///
/// @return the precedence registered for g_cur_token, or -1 when the current
///         token is not a registered binary operator (this includes tokens
///         outside the ASCII range and entries whose precedence is <= 0).
static int GetTokenPrecedence() {
  // Only 7-bit ASCII characters can be operators; the negative TOKEN_* codes
  // are rejected here as well. (Same test as the non-standard isascii().)
  if (g_cur_token < 0 || g_cur_token > 127)
    return -1;

  // Look up with find() rather than operator[] so that querying a token that
  // is not an operator does not insert a zero entry into the table.
  const auto entry = g_binop_precedence.find(static_cast<char>(g_cur_token));
  if (entry == g_binop_precedence.end() || entry->second <= 0)
    return -1;
  return entry->second;
}

// Program entry point (excerpt). Registers the precedence of each supported
// binary operator in g_binop_precedence; the `...` marks the part of main
// that is not shown here.
int main() {
  // Install standard binary operators.
  // 1 is lowest precedence; larger numbers bind more tightly.
  g_binop_precedence['<'] = 10;
  g_binop_precedence['+'] = 20;
  g_binop_precedence['-'] = 20;  // same precedence as '+'
  g_binop_precedence['*'] = 40;  // highest.
  ...
}

The expression a+b+(c+d)*e*f+g can be regarded as a stream of primary expressions separated by binary operators. The parser first parses the leading primary expression a, and then the remaining pairs [+, b], [+, (c+d)], [*, e], [*, f] and [+, g]. That is, a complex expression can be abstracted as a primary expression followed by zero or more [binop, primaryexpr] pairs.

/// expression
///   ::= primary binoprhs
// Parses a full expression: one primary expression followed by any number of
// (operator, primary) pairs. Returns nullptr if the leading primary expression
// cannot be parsed.
static std::unique_ptr<ExprAST> ParseExpression() {
  if (auto first = ParsePrimary())
    return ParseBinOpRHS(0, std::move(first));
  return nullptr;
}

For the expression a+b+(c+d)*e*f+g, a will be passed into ParseBinOpRHS, and the current token is +.

The first parameter of ParseBinOpRHS is the minimum precedence an operator must have for the function to consume it. If the value is 40 and the pair at the head of the stream is [+, a], the function will not consume any token, because the precedence of + is only 20, which is lower than 40.

Because a token that is not a binary operator has precedence -1, the function returns as soon as the current token is not a binary operator, i.e. when the stream of [binop, primaryexpr] pairs has ended.

/// binoprhs
///   ::= ('+' primary)*
// Parses the sequence of (operator, primary) pairs that follows an already
// parsed left-hand side.
//
// min_precedence: lowest operator precedence this call is allowed to consume.
// lhs:            the expression parsed so far; ownership is taken.
// Returns the combined expression, or nullptr if an operand fails to parse.
static std::unique_ptr<ExprAST> ParseBinOpRHS(int min_precedence,
                                              std::unique_ptr<ExprAST> lhs) {
  for (;;) {
    const int op_prec = GetTokenPrecedence();

    // Stop when the pending token binds less tightly than we may consume.
    // A token that is not a binary operator has precedence -1, so it always
    // ends the loop here.
    if (op_prec < min_precedence)
      return lhs;

    // The current token is a binary operator we are going to consume.
    const int op = g_cur_token;
    NextToken();  // consume the operator

    // The operand that follows the operator.
    auto operand = ParsePrimary();
    if (!operand)
      return nullptr;

    // Two groupings are possible for `lhs op operand next_op rest`:
    //    (lhs op operand) next_op rest
    //    lhs op (operand next_op rest)
    // If the operator after the operand binds more tightly than `op`, the
    // operand belongs to it: parse that tighter sub-expression first. Passing
    // op_prec + 1 restricts the recursive call to operators that bind strictly
    // tighter than `op`.
    if (GetTokenPrecedence() > op_prec) {
      operand = ParseBinOpRHS(op_prec + 1, std::move(operand));
      if (!operand)
        return nullptr;
    }

    // Fold the pair into the left-hand side and continue with the next pair.
    lhs = std::make_unique<BinaryExprAST>(op, std::move(lhs),
                                          std::move(operand));
  }
}

3.4 analyze the rest

Parsing function prototype:

/// prototype
///   ::= id '(' id* ')'
// Parses a function prototype: the function name followed by a parenthesised
// list of parameter names. Parameter names are consecutive identifiers with no
// separator between them. On a syntax error the result of LogErrorP is
// returned.
static std::unique_ptr<PrototypeAST> ParsePrototype() {
  if (g_cur_token != TOKEN_IDENTIFIER)
    return LogErrorP("Expected function name in prototype");

  // Copy the name before advancing; the lexer reuses g_identifier_str.
  std::string fn_name = g_identifier_str;
  NextToken();  // eat the function name.

  if (g_cur_token != '(')
    return LogErrorP("Expected '(' in prototype");

  // Read the list of argument names. The first NextToken() call also eats
  // the '('.
  std::vector<std::string> arg_names;
  while (NextToken() == TOKEN_IDENTIFIER)
    arg_names.push_back(g_identifier_str);
  if (g_cur_token != ')')
    return LogErrorP("Expected ')' in prototype");

  // success.
  NextToken();  // eat ')'.

  return std::make_unique<PrototypeAST>(fn_name, std::move(arg_names));
}

Parsing a function definition: the keyword def, followed by a function prototype, followed by the function body, which is a single expression.

/// definition ::= 'def' prototype expression
// Parses a function definition. Intended to be called when the current token
// is TOKEN_DEF. Returns nullptr if either the prototype or the body fails to
// parse.
static std::unique_ptr<FunctionAST> ParseDefinition() {
  NextToken();  // step past 'def'

  auto signature = ParsePrototype();
  if (!signature)
    return nullptr;

  auto body = ParseExpression();
  if (!body)
    return nullptr;

  return std::make_unique<FunctionAST>(std::move(signature), std::move(body));
}

Parse extern expression:

/// external ::= 'extern' prototype
// Parses an external declaration: the 'extern' keyword followed by a
// prototype. Intended to be called when the current token is TOKEN_EXTERN.
static std::unique_ptr<PrototypeAST> ParseExtern() {
  NextToken();  // step past 'extern'
  auto declaration = ParsePrototype();
  return declaration;
}

Implement anonymous functions for top-level Code:

/// toplevelexpr ::= expression
// Parses a top-level expression and wraps it in an anonymous function: a
// FunctionAST whose prototype has an empty name and no parameters. Returns
// nullptr if the expression fails to parse.
static std::unique_ptr<FunctionAST> ParseTopLevelExpr() {
  auto body = ParseExpression();
  if (!body)
    return nullptr;

  // The anonymous prototype: empty name, no parameters.
  auto anon_proto =
      std::make_unique<PrototypeAST>("", std::vector<std::string>());
  return std::make_unique<FunctionAST>(std::move(anon_proto), std::move(body));
}

3.5 Driver

/// top ::= definition | external | expression | ';'
// Interpreter driver loop: prints a prompt, then dispatches on the current
// token until end of input is reached.
static void MainLoop() {
  for (;;) {
    fprintf(stderr, "ready> ");

    const int tok = g_cur_token;
    if (tok == TOKEN_EOF)
      return;

    if (tok == ';') {
      NextToken();  // top-level semicolons are ignored
    } else if (tok == TOKEN_DEF) {
      HandleDefinition();
    } else if (tok == TOKEN_EXTERN) {
      HandleExtern();
    } else {
      HandleTopLevelExpression();
    }
  }
}

Compile run:

# Compile
$ clang++ -g -O3 toy.cpp `llvm-config --cxxflags`
# Run
$ ./a.out
ready> def foo(x y) x+foo(y, 4.0);
Parsed a function definition.
ready> def foo(x y) x+y y;
Parsed a function definition.
Parsed a top-level expr
ready> def foo(x y) x+y );
Parsed a function definition.
Error: unknown token when expecting an expression
ready> extern sin(a);
ready> Parsed an extern
ready> ^D

4 semantic analysis

Semantic analysis is to eliminate semantic ambiguity, such as:

  • Data types do not match. Do you want to do automatic conversion? Since the data types involved in this paper are all float64 types, data type derivation and conversion are not involved.
  • If there are variables with the same name inside and outside a code block, which one should be used?
  • Two variables with the same name are not allowed in the same scope.

Semantic analysis will also generate some attribute information and mark it on AST.

References

https://llvm.org/docs/tutorial/index.html

Gong Wenxue, "The Beauty of Compiler Principles"

Topics: llvm