Sunday, November 30, 2008

Data on GCC 4

Learning about GCC 4

GCC reportedly supports 30 architectures and runs to 15 million lines.

GCC Internals, D. Novillo, 2007 International Symposium on Code Generation and Optimization (CGO), San Jose, California, March 2007.

Diego Novillo writes that it is 2.2 million LOC (apparently measured with a tool called SLOCCount).

How was that counted? 15 million LOC seems far too high.

Let me count the latest snapshot, gcc-4.4-20081128.
~/work/gcc/gcc-4.4-20081128$ find . -print0 | xargs -0 wc -l
668335 total

Well, that seems plausible. Being a snapshot, maybe there is less in it.

I tried the same on the gcc-4.3.2.tar.bz2 release.
~/work/gcc/gcc-4.3.2$ find . -print0 | xargs -0 wc -l
235370 total

Hmm. The release counts fewer lines than the snapshot, and both totals are far below either published figure, so something is off with the measurement.
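One likely explanation: find emits so many paths that xargs splits them over several wc runs, each of which prints its own "total" line, so the figure at the bottom is only the last batch's subtotal. Funneling everything through a single counting process avoids this; for example, restricted to C sources:

~/work/gcc/gcc-4.3.2$ find . -type f -name '*.[ch]' -print0 | xargs -0 cat | wc -l

(Novillo's 2.2M figure came from SLOCCount, which counts only source lines: sloccount gcc-4.3.2.)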

Friday, November 28, 2008

Code insertion

There is plenty of demand for injecting arbitrary code for debugging, profiling, security checks, and the like.

There seem to be two main approaches: injecting at the GIMPLE level, and injecting at the MD level.

Injecting something at the GIMPLE level requires knowledge of GCC's internal tree data structures, and the optimizers may later reorder the injected code.

The MD level comes after optimization, and patterns can basically be written in the style of GCC inline assembler, so it feels comparatively easy.

GCC's SSP (Stack Smashing Protector) apparently injects its security code from the MD files.

Let's take a look at gcc-4.3.2/gcc/config/i386/i386.md.
; SSP patterns
(UNSPEC_SP_SET        100)
(UNSPEC_SP_TEST       101)
(UNSPEC_SP_TLS_SET    102)
(UNSPEC_SP_TLS_TEST   103)
...

(define_expand "stack_protect_set"
  [(match_operand 0 "memory_operand" "")
   (match_operand 1 "memory_operand" "")]
  ""
{
#ifdef TARGET_THREAD_SSP_OFFSET
  if (TARGET_64BIT)
    emit_insn (gen_stack_tls_protect_set_di (operands[0],
                                             GEN_INT (TARGET_THREAD_SSP_OFFSET)));
  else
    emit_insn (gen_stack_tls_protect_set_si (operands[0],
                                             GEN_INT (TARGET_THREAD_SSP_OFFSET)));
#else
  if (TARGET_64BIT)
    emit_insn (gen_stack_protect_set_di (operands[0], operands[1]));
  else
    emit_insn (gen_stack_protect_set_si (operands[0], operands[1]));
#endif
  DONE;
})

(define_insn "stack_protect_set_si"
  [(set (match_operand:SI 0 "memory_operand" "=m")
        (unspec:SI [(match_operand:SI 1 "memory_operand" "m")] UNSPEC_SP_SET))
   (set (match_scratch:SI 2 "=&r") (const_int 0))
   (clobber (reg:CC FLAGS_REG))]
  ""
  "mov{l}\t{%1, %2|%2, %1}\;mov{l}\t{%2, %0|%0, %2}\;xor{l}\t%2, %2"
  [(set_attr "type" "multi")])

(define_insn "stack_protect_set_di"
  [(set (match_operand:DI 0 "memory_operand" "=m")
        (unspec:DI [(match_operand:DI 1 "memory_operand" "m")] UNSPEC_SP_SET))
   (set (match_scratch:DI 2 "=&r") (const_int 0))
   (clobber (reg:CC FLAGS_REG))]
  "TARGET_64BIT"
  "mov{q}\t{%1, %2|%2, %1}\;mov{q}\t{%2, %0|%0, %2}\;xor{l}\t%k2, %k2"
  [(set_attr "type" "multi")])

(define_insn "stack_tls_protect_set_si"
  [(set (match_operand:SI 0 "memory_operand" "=m")
        (unspec:SI [(match_operand:SI 1 "const_int_operand" "i")] UNSPEC_SP_TLS_SET))
   (set (match_scratch:SI 2 "=&r") (const_int 0))
   (clobber (reg:CC FLAGS_REG))]
  ""
  "mov{l}\t{%%gs:%P1, %2|%2, DWORD PTR gs:%P1}\;mov{l}\t{%2, %0|%0, %2}\;xor{l}\t%2, %2"
  [(set_attr "type" "multi")])

(define_insn "stack_tls_protect_set_di"
  [(set (match_operand:DI 0 "memory_operand" "=m")
        (unspec:DI [(match_operand:DI 1 "const_int_operand" "i")] UNSPEC_SP_TLS_SET))
   (set (match_scratch:DI 2 "=&r") (const_int 0))
   (clobber (reg:CC FLAGS_REG))]
  "TARGET_64BIT"
{
  /* The kernel uses a different segment register for performance reasons; a
     system call would not have to trash the userspace segment register,
     which would be expensive */
  if (ix86_cmodel != CM_KERNEL)
    return "mov{q}\t{%%fs:%P1, %2|%2, QWORD PTR fs:%P1}\;mov{q}\t{%2, %0|%0, %2}\;xor{l}\t%k2, %k2";
  else
    return "mov{q}\t{%%gs:%P1, %2|%2, QWORD PTR gs:%P1}\;mov{q}\t{%2, %0|%0, %2}\;xor{l}\t%k2, %k2";
}
  [(set_attr "type" "multi")])

(define_expand "stack_protect_test"
  [(match_operand 0 "memory_operand" "")
   (match_operand 1 "memory_operand" "")
   (match_operand 2 "" "")]
  ""
{
  rtx flags = gen_rtx_REG (CCZmode, FLAGS_REG);
  ix86_compare_op0 = operands[0];
  ix86_compare_op1 = operands[1];
  ix86_compare_emitted = flags;

#ifdef TARGET_THREAD_SSP_OFFSET
  if (TARGET_64BIT)
    emit_insn (gen_stack_tls_protect_test_di (flags, operands[0],
                                              GEN_INT (TARGET_THREAD_SSP_OFFSET)));
  else
    emit_insn (gen_stack_tls_protect_test_si (flags, operands[0],
                                              GEN_INT (TARGET_THREAD_SSP_OFFSET)));
#else
  if (TARGET_64BIT)
    emit_insn (gen_stack_protect_test_di (flags, operands[0], operands[1]));
  else
    emit_insn (gen_stack_protect_test_si (flags, operands[0], operands[1]));
#endif
  emit_jump_insn (gen_beq (operands[2]));
  DONE;
})

(define_insn "stack_protect_test_si"
  [(set (match_operand:CCZ 0 "flags_reg_operand" "")
        (unspec:CCZ [(match_operand:SI 1 "memory_operand" "m")
                     (match_operand:SI 2 "memory_operand" "m")]
                    UNSPEC_SP_TEST))
   (clobber (match_scratch:SI 3 "=&r"))]
  ""
  "mov{l}\t{%1, %3|%3, %1}\;xor{l}\t{%2, %3|%3, %2}"
  [(set_attr "type" "multi")])

(define_insn "stack_protect_test_di"
  [(set (match_operand:CCZ 0 "flags_reg_operand" "")
        (unspec:CCZ [(match_operand:DI 1 "memory_operand" "m")
                     (match_operand:DI 2 "memory_operand" "m")]
                    UNSPEC_SP_TEST))
   (clobber (match_scratch:DI 3 "=&r"))]
  "TARGET_64BIT"
  "mov{q}\t{%1, %3|%3, %1}\;xor{q}\t{%2, %3|%3, %2}"
  [(set_attr "type" "multi")])

(define_insn "stack_tls_protect_test_si"
  [(set (match_operand:CCZ 0 "flags_reg_operand" "")
        (unspec:CCZ [(match_operand:SI 1 "memory_operand" "m")
                     (match_operand:SI 2 "const_int_operand" "i")]
                    UNSPEC_SP_TLS_TEST))
   (clobber (match_scratch:SI 3 "=r"))]
  ""
  "mov{l}\t{%1, %3|%3, %1}\;xor{l}\t{%%gs:%P2, %3|%3, DWORD PTR gs:%P2}"
  [(set_attr "type" "multi")])

(define_insn "stack_tls_protect_test_di"
  [(set (match_operand:CCZ 0 "flags_reg_operand" "")
        (unspec:CCZ [(match_operand:DI 1 "memory_operand" "m")
                     (match_operand:DI 2 "const_int_operand" "i")]
                    UNSPEC_SP_TLS_TEST))
   (clobber (match_scratch:DI 3 "=r"))]
  "TARGET_64BIT"
{
  /* The kernel uses a different segment register for performance reasons; a
     system call would not have to trash the userspace segment register,
     which would be expensive */
  if (ix86_cmodel != CM_KERNEL)
    return "mov{q}\t{%1, %3|%3, %1}\;xor{q}\t{%%fs:%P2, %3|%3, QWORD PTR fs:%P2}";
  else
    return "mov{q}\t{%1, %3|%3, %1}\;xor{q}\t{%%gs:%P2, %3|%3, QWORD PTR gs:%P2}";
}
  [(set_attr "type" "multi")])
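To actually see these patterns fire, a function with a local char buffer is enough. This is my own toy example; the assembly in the comment is roughly what a 32-bit glibc target produces, where TARGET_THREAD_SSP_OFFSET puts the canary at %gs:20.

/* canary.c -- build with: gcc -S -fstack-protector canary.c */
#include <string.h>

void
fill (const char *src)
{
  char buf[64];               /* a char array this size makes GCC instrument the frame */
  strcpy (buf, src);
}

/* The prologue/epilogue come out roughly as:

     movl  %gs:20, %eax        ; stack_tls_protect_set_si: copy the canary
     movl  %eax, -12(%ebp)     ;   into the frame
     xorl  %eax, %eax          ;   and clear the scratch register
     ...
     movl  -12(%ebp), %eax     ; stack_tls_protect_test_si: reload it
     xorl  %gs:20, %eax        ;   compare against the TLS copy
     jne   ...                 ;   on mismatch, call __stack_chk_fail      */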

Abstract syntax

Concrete syntax is the textual representation of a program; in short, a string.

Inside the compiler it is converted into data structures (the internal representation).
These data structures are not strings, but because they correspond one-to-one with the concrete syntax, they are also called the abstract syntax representation.
And since the syntax forms a tree, the usual name is abstract syntax tree. For instance, the concrete string "a + b * c" becomes something like PLUS_EXPR (a, MULT_EXPR (b, c)); the spaces exist only in the concrete syntax.

program  ⇔ GENERIC (the abstract syntax tree of the program)
assembly ⇔ IR RTL (the abstract syntax tree of the assembly)

Those are the correspondences. To the compiler, the concrete representation is nothing more than an I/O format.
The GCC framework can express the semantics of every language it supports.
In that sense, syntax is a peripheral concern for GCC:
whatever language the source is written in, syntax that maps to the same semantics yields the same code.
You could say the internal representation is the representation of the semantics itself.

Optimization is performed after converting to a simpler representation called GIMPLE.
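The GIMPLE form is easy to inspect with -fdump-tree-gimple, which writes a dump file next to the source. Below, the dump is reproduced roughly from memory, so treat the exact file suffix and the D.xxxx temporary names as illustrative:

/* t.c -- compile with: gcc -O0 -fdump-tree-gimple -c t.c */
int
f (int a, int b, int c)
{
  return a + b * c;
}

/* t.c.004t.gimple then contains roughly:

     f (a, b, c)
     {
       int D.1200;
       int D.1201;

       D.1200 = b * c;        <-- each statement has at most one operation
       D.1201 = a + D.1200;
       return D.1201;
     }                                                                    */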

front end  : GENERIC
middle end : GIMPLE
back end   : IR RTL

This structure dates from GCC 4.0 and was achieved largely by Red Hat, which wanted a more flexible optimization framework.
Until then, each front end was responsible for everything down to emitting RTL; RTL was the only representation shared across languages,
and the only shared optimizations were local ones at the RTL level.
In today's GCC, even vectorization and OpenMP are optimized by common middle-end routines.

Inside GCC, GENERIC is handled with a data structure called tree.
IR RTL is handled with a data structure called rtx.

There are two kinds of RTL: MD RTL and IR RTL.

MD stands for Machine Description.
IR stands for Intermediate Representation.

MD RTL is a description of pattern-matching rules; it is not a runtime data structure but is translated straight into C code that is compiled and linked into GCC itself.
Since the intermediate representation is written in IR RTL, writing the translation rules in RTL as well makes the pattern matching easy to express.
MD RTL also expands into gigantic switch statements: converting the roughly 20,000-line i386.md yields on the order of 110,000 lines of C.
The --target= option given to ./configure determines which md file gets built in.

Thursday, November 27, 2008

How assembly is produced from the MD files

The spot where the final assembly is written to the file with fprintf (or rather, something like it: in reality GCC interprets the format string itself and does putc (c, asm_out_file)):
  1. FINAL.C      :  final_scan_insn()
  2. FINAL.C      :   output_asm_insn()

gcc-4.3.2/gcc/final.c

/* Find the proper template for this insn. */
template = get_insn_template (insn_code_number, insn);


/* Output assembler code from the template. */
output_asm_insn (template, recog_data.operand);

template is a const char *, a string that works rather like an fprintf format (e.g. "lea{l}\t{%a1, %0|%0, %a1}"). The %0, %1, ... are operand placeholders; within {...|...} the text before the | is emitted for AT&T syntax and the text after it for Intel syntax, and a braced group without a |, like {l}, is emitted only in AT&T mode.
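A toy interpreter for just the brace-and-bar part makes this concrete. This is my sketch, not GCC's code; the real logic, which also resolves the %-escapes, lives in output_asm_insn in final.c, and the dialect choice is called ASSEMBLER_DIALECT there.

#include <stdio.h>
#include <string.h>

static int intel_syntax = 0;    /* 0 = AT&T, 1 = Intel */

/* Emit TEMPL, resolving {att|intel} groups; %-escapes pass through
   untouched.  Assumes every '{' has a matching '}'.  */
static void
emit_template (const char *templ)
{
  const char *p;

  for (p = templ; *p; p++)
    if (*p == '{')
      {
        const char *end = strchr (p, '}');
        const char *bar = memchr (p, '|', end - p);

        if (!intel_syntax)
          fwrite (p + 1, 1, (bar ? bar : end) - (p + 1), stdout);
        else if (bar)
          fwrite (bar + 1, 1, end - (bar + 1), stdout);
        p = end;
      }
    else
      putchar (*p);
  putchar ('\n');
}

int
main (void)
{
  emit_template ("lea{l}\t{%a1, %0|%0, %a1}");   /* prints "leal\t%a1, %0" */
  intel_syntax = 1;
  emit_template ("lea{l}\t{%a1, %0|%0, %a1}");   /* prints "lea\t%0, %a1"  */
  return 0;
}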

This template comes from nothing more than indexing a table called insn_data[].

const char *
get_insn_template (int code, rtx insn)
{
  switch (insn_data[code].output_format)
    {
    case INSN_OUTPUT_FORMAT_SINGLE:
      return insn_data[code].output.single;
    case INSN_OUTPUT_FORMAT_MULTI:
      return insn_data[code].output.multi[which_alternative];
    case INSN_OUTPUT_FORMAT_FUNCTION:
      gcc_assert (insn);
      return (*insn_data[code].output.function) (recog_data.operand, insn);

    default:
      gcc_unreachable ();
    }
}

insn_data[] is held as a constant inside insn-output.c.

const struct insn_data insn_data[] =
{
  /* ../../gcc-4.3.2/gcc/config/i386/i386.md:694 */
  {
    "*cmpsi_ccno_1",
#if HAVE_DESIGNATED_INITIALIZERS
    { .multi = output_0 },
#else
    { 0, output_0, 0 },
#endif
    0,
    &operand_data[1],
    2,
    0,
    2,
    2
  },
  /* ../../gcc-4.3.2/gcc/config/i386/i386.md:706 */
  {
    "*cmpsi_minus_1",
#if HAVE_DESIGNATED_INITIALIZERS
    { .single =
#else
    {
#endif
    "cmp{l}\t{%1, %0|%0, %1}",
#if HAVE_DESIGNATED_INITIALIZERS
    },
#else
    0, 0 },
#endif
    0,
    &operand_data[3],
    2,
    0,
    2,
    1
  },
  ...

So this is where the template "lea{l}\t{%a1, %0|%0, %a1}" I was looking at in the debugger came from.

  /* ../../gcc-4.3.2/gcc/config/i386/i386.md:5571 */
  {
    "*lea_1",
#if HAVE_DESIGNATED_INITIALIZERS
    { .single =
#else
    {
#endif
    "lea{l}\t{%a1, %0|%0, %a1}",
#if HAVE_DESIGNATED_INITIALIZERS
    },
#else
    0, 0 },
#endif
    0,
    &operand_data[359],
    2,
    0,
    1,
    1
  },

And this insn_data[] is generated from MD files such as i386.md by genoutput, one of the gen* generator programs built early during a GCC build.

(define_insn "*cmpsi_ccno_1"
  [(set (reg FLAGS_REG)
        (compare (match_operand:SI 0 "nonimmediate_operand" "r,?mr")
                 (match_operand:SI 1 "const0_operand" "n,n")))]
  "ix86_match_ccmode (insn, CCNOmode)"
  "@
   test{l}\t%0, %0
   cmp{l}\t{%1, %0|%0, %1}"
  [(set_attr "type" "test,icmp")
   (set_attr "length_immediate" "0,1")
   (set_attr "mode" "SI")])

(define_insn "*cmpsi_minus_1"
  [(set (reg FLAGS_REG)
        (compare (minus:SI (match_operand:SI 0 "nonimmediate_operand" "rm,r")
                           (match_operand:SI 1 "general_operand" "ri,mr"))
                 (const_int 0)))]
  "ix86_match_ccmode (insn, CCGOCmode)"
  "cmp{l}\t{%1, %0|%0, %1}"
  [(set_attr "type" "icmp")
   (set_attr "mode" "SI")])

...

(define_insn "*lea_1"
  [(set (match_operand:SI 0 "register_operand" "=r")
        (match_operand:SI 1 "no_seg_address_operand" "p"))]
  "!TARGET_64BIT"
  "lea{l}\t{%a1, %0|%0, %a1}"
  [(set_attr "type" "lea")
   (set_attr "mode" "SI")])

Machine Description resources

GCC Internals - Machine Desc

Writing GCC Machine Descriptions / Systematic Development of GCC MD

Embedding function calls

With -finstrument-functions, GCC can hook an arbitrary profiling function into every function's entry and return.

Visualizing function calls with Graphviz

Incidentally, these profiling hooks (__cyg_profile_func_enter/exit()) are called by hard-coded names, so in C++ they must be declared extern "C".

How to use '-finstrument-functions' in C++ programs ?
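A minimal pair of hooks looks like this (my sketch; the two-pointer signature — function address and call site — is what GCC passes, and the no_instrument_function attribute keeps the hooks themselves from being instrumented recursively):

/* trace.c -- build with: gcc -finstrument-functions main.c trace.c */
#include <stdio.h>

void __cyg_profile_func_enter (void *func, void *call_site)
  __attribute__ ((no_instrument_function));
void __cyg_profile_func_exit (void *func, void *call_site)
  __attribute__ ((no_instrument_function));

void
__cyg_profile_func_enter (void *func, void *call_site)
{
  fprintf (stderr, "enter %p (called from %p)\n", func, call_site);
}

void
__cyg_profile_func_exit (void *func, void *call_site)
{
  fprintf (stderr, "exit  %p (called from %p)\n", func, call_site);
}

In C++, as the Q&A above explains, the two declarations additionally need extern "C".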

The feature seems to be implemented simply by directly expanding a function-call RTX.

[tree-ssa][rfc] -finstrument-functions at tree level

gcc-4.3.2/gcc/builtins.c

static rtx
expand_builtin_profile_func (bool exitp)
{
  rtx this, which;

  this = DECL_RTL (current_function_decl);
  gcc_assert (MEM_P (this));
  this = XEXP (this, 0);

  if (exitp)
    which = profile_function_exit_libfunc;
  else
    which = profile_function_entry_libfunc;

  emit_library_call (which, LCT_NORMAL, VOIDmode, 2, this, Pmode,
                     expand_builtin_return_addr (BUILT_IN_RETURN_ADDRESS, 0),
                     Pmode);

  return const0_rtx;
}

gcc-4.3.2/gcc/libfuncs.h

#define profile_function_exit_libfunc (libfunc_table[LTI_profile_function_exit])

gcc-4.3.2/gcc/optabs.c

rtx libfunc_table[LTI_MAX];


rtx
init_one_libfunc (const char *name)
{
  rtx symbol;

  /* Create a FUNCTION_DECL that can be passed to
     targetm.encode_section_info.  */
  /* ??? We don't have any type information except for this is
     a function.  Pretend this is "int foo ()".  */
  tree decl = build_decl (FUNCTION_DECL, get_identifier (name),
                          build_function_type (integer_type_node, NULL_TREE));
  DECL_ARTIFICIAL (decl) = 1;
  DECL_EXTERNAL (decl) = 1;
  TREE_PUBLIC (decl) = 1;

  symbol = XEXP (DECL_RTL (decl), 0);

  /* Zap the nonsensical SYMBOL_REF_DECL for this.  What we're left with
     are the flags assigned by targetm.encode_section_info.  */
  SET_SYMBOL_REF_DECL (symbol, 0);

  return symbol;
}

So it seems to simply fabricate a call to a function that is external and artificial.

abort_libfunc = init_one_libfunc ("abort");
memcpy_libfunc = init_one_libfunc ("memcpy");
memmove_libfunc = init_one_libfunc ("memmove");
memcmp_libfunc = init_one_libfunc ("memcmp");
memset_libfunc = init_one_libfunc ("memset");
setbits_libfunc = init_one_libfunc ("__setbits");

#ifndef DONT_USE_BUILTIN_SETJMP
setjmp_libfunc = init_one_libfunc ("__builtin_setjmp");
longjmp_libfunc = init_one_libfunc ("__builtin_longjmp");
#else
setjmp_libfunc = init_one_libfunc ("setjmp");
longjmp_libfunc = init_one_libfunc ("longjmp");
#endif
unwind_sjlj_register_libfunc = init_one_libfunc ("_Unwind_SjLj_Register");
unwind_sjlj_unregister_libfunc
= init_one_libfunc ("_Unwind_SjLj_Unregister");

/* For function entry/exit instrumentation. */
profile_function_entry_libfunc
= init_one_libfunc ("__cyg_profile_func_enter");
profile_function_exit_libfunc
= init_one_libfunc ("__cyg_profile_func_exit");

gcov_flush_libfunc = init_one_libfunc ("__gcov_flush");

gcc-4.3.2/gcc/calls.c

/* Output a library call to function FUN (a SYMBOL_REF rtx)
   (emitting the queue unless NO_QUEUE is nonzero),
   for a value of mode OUTMODE,
   with NARGS different arguments, passed as alternating rtx values
   and machine_modes to convert them to.

   FN_TYPE should be LCT_NORMAL for `normal' calls, LCT_CONST for `const'
   calls, LCT_PURE for `pure' calls, LCT_CONST_MAKE_BLOCK for `const' calls
   which should be enclosed in REG_LIBCALL/REG_RETVAL notes,
   LCT_PURE_MAKE_BLOCK for `purep' calls which should be enclosed in
   REG_LIBCALL/REG_RETVAL notes with extra (use (memory (scratch)),
   or other LCT_ value for other types of library calls.  */

void
emit_library_call (rtx orgfun, enum libcall_type fn_type,
                   enum machine_mode outmode, int nargs, ...)
{
  va_list p;

  va_start (p, nargs);
  emit_library_call_value_1 (0, orgfun, NULL_RTX, fn_type, outmode, nargs, p);
  va_end (p);
}

emit_library_call_value_1, in the same file, is enormous.

/* Output a library call to function FUN (a SYMBOL_REF rtx).
The RETVAL parameter specifies whether return value needs to be saved, other
parameters are documented in the emit_library_call function below. */

static rtx
emit_library_call_value_1 (int retval, rtx orgfun, rtx value,
enum libcall_type fn_type,
enum machine_mode outmode, int nargs, va_list p)
{
/* Total size in bytes of all the stack-parms scanned so far. */
struct args_size args_size;
/* Size of arguments before any adjustments (such as rounding). */
struct args_size original_args_size;
int argnum;
rtx fun;
int inc;
int count;
rtx argblock = 0;
CUMULATIVE_ARGS args_so_far;
struct arg
{
rtx value;
enum machine_mode mode;
rtx reg;
int partial;
struct locate_and_pad_arg_data locate;
rtx save_area;
};
struct arg *argvec;
int old_inhibit_defer_pop = inhibit_defer_pop;
rtx call_fusage = 0;
rtx mem_value = 0;
rtx valreg;
int pcc_struct_value = 0;
int struct_value_size = 0;
int flags;
int reg_parm_stack_space = 0;
int needed;
rtx before_call;
tree tfom; /* type_for_mode (outmode, 0) */

#ifdef REG_PARM_STACK_SPACE
/* Define the boundary of the register parm stack space that needs to be
save, if any. */
int low_to_save, high_to_save;
rtx save_area = 0; /* Place that it is saved. */
#endif

/* Size of the stack reserved for parameter registers. */
int initial_highest_arg_in_use = highest_outgoing_arg_in_use;
char *initial_stack_usage_map = stack_usage_map;
char *stack_usage_map_buf = NULL;

rtx struct_value = targetm.calls.struct_value_rtx (0, 0);

#ifdef REG_PARM_STACK_SPACE
reg_parm_stack_space = REG_PARM_STACK_SPACE ((tree) 0);
#endif

/* By default, library functions can not throw. */
flags = ECF_NOTHROW;

switch (fn_type)
{
case LCT_NORMAL:
break;
case LCT_CONST:
flags |= ECF_CONST;
break;
case LCT_PURE:
flags |= ECF_PURE;
break;
case LCT_CONST_MAKE_BLOCK:
flags |= ECF_CONST | ECF_LIBCALL_BLOCK;
break;
case LCT_PURE_MAKE_BLOCK:
flags |= ECF_PURE | ECF_LIBCALL_BLOCK;
break;
case LCT_NORETURN:
flags |= ECF_NORETURN;
break;
case LCT_THROW:
flags = ECF_NORETURN;
break;
case LCT_RETURNS_TWICE:
flags = ECF_RETURNS_TWICE;
break;
}
fun = orgfun;

/* Ensure current function's preferred stack boundary is at least
what we need. */
if (cfun->preferred_stack_boundary < PREFERRED_STACK_BOUNDARY)
cfun->preferred_stack_boundary = PREFERRED_STACK_BOUNDARY;

/* If this kind of value comes back in memory,
decide where in memory it should come back. */
if (outmode != VOIDmode)
{
tfom = lang_hooks.types.type_for_mode (outmode, 0);
if (aggregate_value_p (tfom, 0))
{
#ifdef PCC_STATIC_STRUCT_RETURN
rtx pointer_reg
= hard_function_value (build_pointer_type (tfom), 0, 0, 0);
mem_value = gen_rtx_MEM (outmode, pointer_reg);
pcc_struct_value = 1;
if (value == 0)
value = gen_reg_rtx (outmode);
#else /* not PCC_STATIC_STRUCT_RETURN */
struct_value_size = GET_MODE_SIZE (outmode);
if (value != 0 && MEM_P (value))
mem_value = value;
else
mem_value = assign_temp (tfom, 0, 1, 1);
#endif
/* This call returns a big structure. */
flags &= ~(ECF_CONST | ECF_PURE | ECF_LIBCALL_BLOCK);
}
}
else
tfom = void_type_node;

/* ??? Unfinished: must pass the memory address as an argument. */

/* Copy all the libcall-arguments out of the varargs data
and into a vector ARGVEC.

Compute how to pass each argument. We only support a very small subset
of the full argument passing conventions to limit complexity here since
library functions shouldn't have many args. */

argvec = alloca ((nargs + 1) * sizeof (struct arg));
memset (argvec, 0, (nargs + 1) * sizeof (struct arg));

#ifdef INIT_CUMULATIVE_LIBCALL_ARGS
INIT_CUMULATIVE_LIBCALL_ARGS (args_so_far, outmode, fun);
#else
INIT_CUMULATIVE_ARGS (args_so_far, NULL_TREE, fun, 0, nargs);
#endif

args_size.constant = 0;
args_size.var = 0;

count = 0;

/* Now we are about to start emitting insns that can be deleted
if a libcall is deleted. */
if (flags & ECF_LIBCALL_BLOCK)
start_sequence ();

push_temp_slots ();

/* If there's a structure value address to be passed,
either pass it in the special place, or pass it as an extra argument. */
if (mem_value && struct_value == 0 && ! pcc_struct_value)
{
rtx addr = XEXP (mem_value, 0);

nargs++;

/* Make sure it is a reasonable operand for a move or push insn. */
if (!REG_P (addr) && !MEM_P (addr)
&& ! (CONSTANT_P (addr) && LEGITIMATE_CONSTANT_P (addr)))
addr = force_operand (addr, NULL_RTX);

argvec[count].value = addr;
argvec[count].mode = Pmode;
argvec[count].partial = 0;

argvec[count].reg = FUNCTION_ARG (args_so_far, Pmode, NULL_TREE, 1);
gcc_assert (targetm.calls.arg_partial_bytes (&args_so_far, Pmode,
NULL_TREE, 1) == 0);

locate_and_pad_parm (Pmode, NULL_TREE,
#ifdef STACK_PARMS_IN_REG_PARM_AREA
1,
#else
argvec[count].reg != 0,
#endif
0, NULL_TREE, &args_size, &argvec[count].locate);

if (argvec[count].reg == 0 || argvec[count].partial != 0
|| reg_parm_stack_space > 0)
args_size.constant += argvec[count].locate.size.constant;

FUNCTION_ARG_ADVANCE (args_so_far, Pmode, (tree) 0, 1);

count++;
}

for (; count < nargs; count++)
{
rtx val = va_arg (p, rtx);
enum machine_mode mode = va_arg (p, enum machine_mode);

/* We cannot convert the arg value to the mode the library wants here;
must do it earlier where we know the signedness of the arg. */
gcc_assert (mode != BLKmode
&& (GET_MODE (val) == mode || GET_MODE (val) == VOIDmode));

/* Make sure it is a reasonable operand for a move or push insn. */
if (!REG_P (val) && !MEM_P (val)
&& ! (CONSTANT_P (val) && LEGITIMATE_CONSTANT_P (val)))
val = force_operand (val, NULL_RTX);

if (pass_by_reference (&args_so_far, mode, NULL_TREE, 1))
{
rtx slot;
int must_copy
= !reference_callee_copied (&args_so_far, mode, NULL_TREE, 1);

/* loop.c won't look at CALL_INSN_FUNCTION_USAGE of const/pure
functions, so we have to pretend this isn't such a function. */
if (flags & ECF_LIBCALL_BLOCK)
{
rtx insns = get_insns ();
end_sequence ();
emit_insn (insns);
}
flags &= ~(ECF_CONST | ECF_PURE | ECF_LIBCALL_BLOCK);

/* If this was a CONST function, it is now PURE since
it now reads memory. */
if (flags & ECF_CONST)
{
flags &= ~ECF_CONST;
flags |= ECF_PURE;
}

if (GET_MODE (val) == MEM && !must_copy)
slot = val;
else
{
slot = assign_temp (lang_hooks.types.type_for_mode (mode, 0),
0, 1, 1);
emit_move_insn (slot, val);
}

call_fusage = gen_rtx_EXPR_LIST (VOIDmode,
gen_rtx_USE (VOIDmode, slot),
call_fusage);
if (must_copy)
call_fusage = gen_rtx_EXPR_LIST (VOIDmode,
gen_rtx_CLOBBER (VOIDmode,
slot),
call_fusage);

mode = Pmode;
val = force_operand (XEXP (slot, 0), NULL_RTX);
}

argvec[count].value = val;
argvec[count].mode = mode;

argvec[count].reg = FUNCTION_ARG (args_so_far, mode, NULL_TREE, 1);

argvec[count].partial
= targetm.calls.arg_partial_bytes (&args_so_far, mode, NULL_TREE, 1);

locate_and_pad_parm (mode, NULL_TREE,
#ifdef STACK_PARMS_IN_REG_PARM_AREA
1,
#else
argvec[count].reg != 0,
#endif
argvec[count].partial,
NULL_TREE, &args_size, &argvec[count].locate);

gcc_assert (!argvec[count].locate.size.var);

if (argvec[count].reg == 0 || argvec[count].partial != 0
|| reg_parm_stack_space > 0)
args_size.constant += argvec[count].locate.size.constant;

FUNCTION_ARG_ADVANCE (args_so_far, mode, (tree) 0, 1);
}

/* If this machine requires an external definition for library
functions, write one out. */
assemble_external_libcall (fun);

original_args_size = args_size;
args_size.constant = (((args_size.constant
+ stack_pointer_delta
+ STACK_BYTES - 1)
/ STACK_BYTES
* STACK_BYTES)
- stack_pointer_delta);

args_size.constant = MAX (args_size.constant,
reg_parm_stack_space);

if (!OUTGOING_REG_PARM_STACK_SPACE)
args_size.constant -= reg_parm_stack_space;

if (args_size.constant > current_function_outgoing_args_size)
current_function_outgoing_args_size = args_size.constant;

if (ACCUMULATE_OUTGOING_ARGS)
{
/* Since the stack pointer will never be pushed, it is possible for
the evaluation of a parm to clobber something we have already
written to the stack. Since most function calls on RISC machines
do not use the stack, this is uncommon, but must work correctly.

Therefore, we save any area of the stack that was already written
and that we are using. Here we set up to do this by making a new
stack usage map from the old one.

Another approach might be to try to reorder the argument
evaluations to avoid this conflicting stack usage. */

needed = args_size.constant;

/* Since we will be writing into the entire argument area, the
map must be allocated for its entire size, not just the part that
is the responsibility of the caller. */
if (!OUTGOING_REG_PARM_STACK_SPACE)
needed += reg_parm_stack_space;

#ifdef ARGS_GROW_DOWNWARD
highest_outgoing_arg_in_use = MAX (initial_highest_arg_in_use,
needed + 1);
#else
highest_outgoing_arg_in_use = MAX (initial_highest_arg_in_use,
needed);
#endif
stack_usage_map_buf = XNEWVEC (char, highest_outgoing_arg_in_use);
stack_usage_map = stack_usage_map_buf;

if (initial_highest_arg_in_use)
memcpy (stack_usage_map, initial_stack_usage_map,
initial_highest_arg_in_use);

if (initial_highest_arg_in_use != highest_outgoing_arg_in_use)
memset (&stack_usage_map[initial_highest_arg_in_use], 0,
highest_outgoing_arg_in_use - initial_highest_arg_in_use);
needed = 0;

/* We must be careful to use virtual regs before they're instantiated,
and real regs afterwards. Loop optimization, for example, can create
new libcalls after we've instantiated the virtual regs, and if we
use virtuals anyway, they won't match the rtl patterns. */

if (virtuals_instantiated)
argblock = plus_constant (stack_pointer_rtx, STACK_POINTER_OFFSET);
else
argblock = virtual_outgoing_args_rtx;
}
else
{
if (!PUSH_ARGS)
argblock = push_block (GEN_INT (args_size.constant), 0, 0);
}

/* If we push args individually in reverse order, perform stack alignment
before the first push (the last arg). */
if (argblock == 0 && PUSH_ARGS_REVERSED)
anti_adjust_stack (GEN_INT (args_size.constant
- original_args_size.constant));

if (PUSH_ARGS_REVERSED)
{
inc = -1;
argnum = nargs - 1;
}
else
{
inc = 1;
argnum = 0;
}

#ifdef REG_PARM_STACK_SPACE
if (ACCUMULATE_OUTGOING_ARGS)
{
/* The argument list is the property of the called routine and it
may clobber it. If the fixed area has been used for previous
parameters, we must save and restore it. */
save_area = save_fixed_argument_area (reg_parm_stack_space, argblock,
&low_to_save, &high_to_save);
}
#endif

/* Push the args that need to be pushed. */

/* ARGNUM indexes the ARGVEC array in the order in which the arguments
are to be pushed. */
for (count = 0; count < nargs; count++, argnum += inc)
{
enum machine_mode mode = argvec[argnum].mode;
rtx val = argvec[argnum].value;
rtx reg = argvec[argnum].reg;
int partial = argvec[argnum].partial;
int lower_bound = 0, upper_bound = 0, i;

if (! (reg != 0 && partial == 0))
{
if (ACCUMULATE_OUTGOING_ARGS)
{
/* If this is being stored into a pre-allocated, fixed-size,
stack area, save any previous data at that location. */

#ifdef ARGS_GROW_DOWNWARD
/* stack_slot is negative, but we want to index stack_usage_map
with positive values. */
upper_bound = -argvec[argnum].locate.offset.constant + 1;
lower_bound = upper_bound - argvec[argnum].locate.size.constant;
#else
lower_bound = argvec[argnum].locate.offset.constant;
upper_bound = lower_bound + argvec[argnum].locate.size.constant;
#endif

i = lower_bound;
/* Don't worry about things in the fixed argument area;
it has already been saved. */
if (i < reg_parm_stack_space)
i = reg_parm_stack_space;
while (i < upper_bound && stack_usage_map[i] == 0)
i++;

if (i < upper_bound)
{
/* We need to make a save area. */
unsigned int size
= argvec[argnum].locate.size.constant * BITS_PER_UNIT;
enum machine_mode save_mode
= mode_for_size (size, MODE_INT, 1);
rtx adr
= plus_constant (argblock,
argvec[argnum].locate.offset.constant);
rtx stack_area
= gen_rtx_MEM (save_mode, memory_address (save_mode, adr));

if (save_mode == BLKmode)
{
argvec[argnum].save_area
= assign_stack_temp (BLKmode,
argvec[argnum].locate.size.constant,
0);

emit_block_move (validize_mem (argvec[argnum].save_area),
stack_area,
GEN_INT (argvec[argnum].locate.size.constant),
BLOCK_OP_CALL_PARM);
}
else
{
argvec[argnum].save_area = gen_reg_rtx (save_mode);

emit_move_insn (argvec[argnum].save_area, stack_area);
}
}
}

emit_push_insn (val, mode, NULL_TREE, NULL_RTX, PARM_BOUNDARY,
partial, reg, 0, argblock,
GEN_INT (argvec[argnum].locate.offset.constant),
reg_parm_stack_space,
ARGS_SIZE_RTX (argvec[argnum].locate.alignment_pad));

/* Now mark the segment we just used. */
if (ACCUMULATE_OUTGOING_ARGS)
for (i = lower_bound; i < upper_bound; i++)
stack_usage_map[i] = 1;

NO_DEFER_POP;

if (flags & ECF_CONST)
{
rtx use;

/* Indicate argument access so that alias.c knows that these
values are live. */
if (argblock)
use = plus_constant (argblock,
argvec[argnum].locate.offset.constant);
else
/* When arguments are pushed, trying to tell alias.c where
exactly this argument is won't work, because the
auto-increment causes confusion. So we merely indicate
that we access something with a known mode somewhere on
the stack. */
use = gen_rtx_PLUS (Pmode, virtual_outgoing_args_rtx,
gen_rtx_SCRATCH (Pmode));
use = gen_rtx_MEM (argvec[argnum].mode, use);
use = gen_rtx_USE (VOIDmode, use);
call_fusage = gen_rtx_EXPR_LIST (VOIDmode, use, call_fusage);
}
}
}

/* If we pushed args in forward order, perform stack alignment
after pushing the last arg. */
if (argblock == 0 && !PUSH_ARGS_REVERSED)
anti_adjust_stack (GEN_INT (args_size.constant
- original_args_size.constant));

if (PUSH_ARGS_REVERSED)
argnum = nargs - 1;
else
argnum = 0;

fun = prepare_call_address (fun, NULL, &call_fusage, 0, 0);

/* Now load any reg parms into their regs. */

/* ARGNUM indexes the ARGVEC array in the order in which the arguments
are to be pushed. */
for (count = 0; count < nargs; count++, argnum += inc)
{
enum machine_mode mode = argvec[argnum].mode;
rtx val = argvec[argnum].value;
rtx reg = argvec[argnum].reg;
int partial = argvec[argnum].partial;

/* Handle calls that pass values in multiple non-contiguous
locations. The PA64 has examples of this for library calls. */
if (reg != 0 && GET_CODE (reg) == PARALLEL)
emit_group_load (reg, val, NULL_TREE, GET_MODE_SIZE (mode));
else if (reg != 0 && partial == 0)
emit_move_insn (reg, val);

NO_DEFER_POP;
}

/* Any regs containing parms remain in use through the call. */
for (count = 0; count < nargs; count++)
{
rtx reg = argvec[count].reg;
if (reg != 0 && GET_CODE (reg) == PARALLEL)
use_group_regs (&call_fusage, reg);
else if (reg != 0)
{
int partial = argvec[count].partial;
if (partial)
{
int nregs;
gcc_assert (partial % UNITS_PER_WORD == 0);
nregs = partial / UNITS_PER_WORD;
use_regs (&call_fusage, REGNO (reg), nregs);
}
else
use_reg (&call_fusage, reg);
}
}

/* Pass the function the address in which to return a structure value. */
if (mem_value != 0 && struct_value != 0 && ! pcc_struct_value)
{
emit_move_insn (struct_value,
force_reg (Pmode,
force_operand (XEXP (mem_value, 0),
NULL_RTX)));
if (REG_P (struct_value))
use_reg (&call_fusage, struct_value);
}

/* Don't allow popping to be deferred, since then
cse'ing of library calls could delete a call and leave the pop. */
NO_DEFER_POP;
valreg = (mem_value == 0 && outmode != VOIDmode
? hard_libcall_value (outmode) : NULL_RTX);

/* Stack must be properly aligned now. */
gcc_assert (!(stack_pointer_delta
& (PREFERRED_STACK_BOUNDARY / BITS_PER_UNIT - 1)));

before_call = get_last_insn ();

/* We pass the old value of inhibit_defer_pop + 1 to emit_call_1, which
will set inhibit_defer_pop to that value. */
/* The return type is needed to decide how many bytes the function pops.
Signedness plays no role in that, so for simplicity, we pretend it's
always signed. We also assume that the list of arguments passed has
no impact, so we pretend it is unknown. */

emit_call_1 (fun, NULL,
get_identifier (XSTR (orgfun, 0)),
build_function_type (tfom, NULL_TREE),
original_args_size.constant, args_size.constant,
struct_value_size,
FUNCTION_ARG (args_so_far, VOIDmode, void_type_node, 1),
valreg,
old_inhibit_defer_pop + 1, call_fusage, flags, & args_so_far);

/* For calls to `setjmp', etc., inform function.c:setjmp_warnings
that it should complain if nonvolatile values are live. For
functions that cannot return, inform flow that control does not
fall through. */

if (flags & ECF_NORETURN)
{
/* The barrier note must be emitted
immediately after the CALL_INSN. Some ports emit more than
just a CALL_INSN above, so we must search for it here. */

rtx last = get_last_insn ();
while (!CALL_P (last))
{
last = PREV_INSN (last);
/* There was no CALL_INSN? */
gcc_assert (last != before_call);
}

emit_barrier_after (last);
}

/* Now restore inhibit_defer_pop to its actual original value. */
OK_DEFER_POP;

/* If call is cse'able, make appropriate pair of reg-notes around it.
Test valreg so we don't crash; may safely ignore `const'
if return type is void. Disable for PARALLEL return values, because
we have no way to move such values into a pseudo register. */
if (flags & ECF_LIBCALL_BLOCK)
{
rtx insns;

if (valreg == 0)
{
insns = get_insns ();
end_sequence ();
emit_insn (insns);
}
else
{
rtx note = 0;
rtx temp;
int i;

if (GET_CODE (valreg) == PARALLEL)
{
temp = gen_reg_rtx (outmode);
emit_group_store (temp, valreg, NULL_TREE,
GET_MODE_SIZE (outmode));
valreg = temp;
}

temp = gen_reg_rtx (GET_MODE (valreg));

/* Construct an "equal form" for the value which mentions all the
arguments in order as well as the function name. */
for (i = 0; i < nargs; i++)
note = gen_rtx_EXPR_LIST (VOIDmode, argvec[i].value, note);
note = gen_rtx_EXPR_LIST (VOIDmode, fun, note);

insns = get_insns ();
end_sequence ();

if (flags & ECF_PURE)
note = gen_rtx_EXPR_LIST (VOIDmode,
gen_rtx_USE (VOIDmode,
gen_rtx_MEM (BLKmode,
gen_rtx_SCRATCH (VOIDmode))),
note);

emit_libcall_block (insns, temp, valreg, note);

valreg = temp;
}
}
pop_temp_slots ();

/* Copy the value to the right place. */
if (outmode != VOIDmode && retval)
{
if (mem_value)
{
if (value == 0)
value = mem_value;
if (value != mem_value)
emit_move_insn (value, mem_value);
}
else if (GET_CODE (valreg) == PARALLEL)
{
if (value == 0)
value = gen_reg_rtx (outmode);
emit_group_store (value, valreg, NULL_TREE, GET_MODE_SIZE (outmode));
}
else
{
/* Convert to the proper mode if PROMOTE_MODE has been active. */
if (GET_MODE (valreg) != outmode)
{
int unsignedp = TYPE_UNSIGNED (tfom);

gcc_assert (targetm.calls.promote_function_return (tfom));
gcc_assert (promote_mode (tfom, outmode, &unsignedp, 0)
== GET_MODE (valreg));

valreg = convert_modes (outmode, GET_MODE (valreg), valreg, 0);
}

if (value != 0)
emit_move_insn (value, valreg);
else
value = valreg;
}
}

if (ACCUMULATE_OUTGOING_ARGS)
{
#ifdef REG_PARM_STACK_SPACE
if (save_area)
restore_fixed_argument_area (save_area, argblock,
high_to_save, low_to_save);
#endif

/* If we saved any argument areas, restore them. */
for (count = 0; count < nargs; count++)
if (argvec[count].save_area)
{
enum machine_mode save_mode = GET_MODE (argvec[count].save_area);
rtx adr = plus_constant (argblock,
argvec[count].locate.offset.constant);
rtx stack_area = gen_rtx_MEM (save_mode,
memory_address (save_mode, adr));

if (save_mode == BLKmode)
emit_block_move (stack_area,
validize_mem (argvec[count].save_area),
GEN_INT (argvec[count].locate.size.constant),
BLOCK_OP_CALL_PARM);
else
emit_move_insn (stack_area, argvec[count].save_area);
}

highest_outgoing_arg_in_use = initial_highest_arg_in_use;
stack_usage_map = initial_stack_usage_map;
}

if (stack_usage_map_buf)
free (stack_usage_map_buf);

return value;

}

Wednesday, November 26, 2008

The data structures that optimization operates on

The interprocedural optimization passes, defined in files like tree-inline.c, tend to use function names of the form cgraph_*().

The optimization passes defined in tree-ssa-*.c, tree-vect*.c, and the like tend to use function names of the form cfg_*().

It gets confusing, so here is a summary.

http://diablo.elis.ugent.be/obfuscation_orig

  • call graph (CG)
    The graph of which procedures call which. Each CG node holds a CFG.
  • control flow graph (CFG)
    The flow graph inside one procedure. Each CFG node is a BB.
  • basic block (BB)
    The basic unit of straight-line code (see the small example below).
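To make BB concrete, here is my own small example; the block numbering follows GCC's convention of reserving 0 and 1 for the entry and exit blocks, and gcc -fdump-tree-cfg on this function dumps the same partitioning:

int
f (int x)
{
  int y = 0;    /* BB 2: straight-line code; the conditional jump ends it */
  if (x > 0)
    y = x;      /* BB 3: the then-arm forms its own basic block           */
  return y;     /* BB 4: the join point begins another block              */
}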

The framework from the middle end through the back end

http://www.airs.com/dnovillo/Papers/

GCC Internals, D. Novillo, 2007 International Symposium on Code Generation and Optimization (CGO), San Jose, California, March 2007.(slides).
  • GCC's middle end and back end are executed together.
The middle end is a list of optimization passes in which GIMPLE → GIMPLE optimizations and the GIMPLE → RTL conversion are mixed.
Each optimization pass is a struct (tree-pass.h) containing, among other fields, an execute function pointer and a gate predicate (controlling, for instance, whether the pass runs at the optimization level selected with -On):

/* Describe one pass.  */
struct tree_opt_pass
{
  /* Terse name of the pass used as a fragment of the dump file name.  */
  const char *name;

  /* If non-null, this pass and all sub-passes are executed only if
     the function returns true.  */
  bool (*gate) (void);

  /* This is the code to run.  If null, then there should be sub-passes
     otherwise this pass does nothing.  The return value contains
     TODOs to execute in addition to those in TODO_flags_finish.  */
  unsigned int (*execute) (void);

  /* A list of sub-passes to run, dependent on gate predicate.  */
  struct tree_opt_pass *sub;

  /* Next in the list of passes to run, independent of gate predicate.  */
  struct tree_opt_pass *next;

  /* Static pass number, used as a fragment of the dump file name.  */
  int static_pass_number;

  /* The timevar id associated with this pass.  */
  /* ??? Ideally would be dynamically assigned.  */
  unsigned int tv_id;

  /* Sets of properties input and output from this pass.  */
  unsigned int properties_required;
  unsigned int properties_provided;
  unsigned int properties_destroyed;

  /* Flags indicating common sets things to do before and after.  */
  unsigned int todo_flags_start;
  unsigned int todo_flags_finish;

  /* Letter for RTL dumps.  */
  char letter;
};

Passes represented this way are executed by execute_pass_list() (passes.c):

void
execute_pass_list (struct tree_opt_pass *pass)
{
  do
    {
      if (execute_one_pass (pass) && pass->sub)
        execute_pass_list (pass->sub);
      pass = pass->next;
    }
  while (pass);
}

The back end is code generation from the RTL.
  • The back end, too, is just one of the passes run by execute_pass_list().
This is the single most confusing thing about GCC.
Because structs stuffed with function pointers are used everywhere, static analysis is hard.
Without a debugger it is next to impossible to pin down which function actually gets executed.
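For the record, one workable recipe (the file name and flags are just an example): break on execute_one_pass inside cc1 and print each pass's name as it comes up.

$ gdb ./cc1
(gdb) break execute_one_pass
(gdb) run -O2 test.c
(gdb) print pass->name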

The pass list is initialized by init_optimization_passes() in the same file.

/* Construct the pass tree.  The sequencing of passes is driven by
   the cgraph routines:

   cgraph_finalize_compilation_unit ()
       for each node N in the cgraph
           cgraph_analyze_function (N)
               cgraph_lower_function (N) -> all_lowering_passes

   If we are optimizing, cgraph_optimize is then invoked:

   cgraph_optimize ()
       ipa_passes () -> all_ipa_passes
       cgraph_expand_all_functions ()
           for each node N in the cgraph
               cgraph_expand_function (N)
                   tree_rest_of_compilation (DECL (N)) -> all_passes
*/

void
init_optimization_passes (void)
{
  struct tree_opt_pass **p;

#define NEXT_PASS(PASS)  (p = next_pass_1 (p, &PASS))

  /* All passes needed to lower the function into shape optimizers can
     operate on.  These passes are always run first on the function, but
     backend might produce already lowered functions that are not processed
     by these passes.  */
  p = &all_lowering_passes;
  NEXT_PASS (pass_remove_useless_stmts);
  NEXT_PASS (pass_mudflap_1);
  NEXT_PASS (pass_lower_omp);
  NEXT_PASS (pass_lower_cf);
  NEXT_PASS (pass_refactor_eh);
  NEXT_PASS (pass_lower_eh);
  NEXT_PASS (pass_build_cfg);
  NEXT_PASS (pass_lower_complex_O0);
  NEXT_PASS (pass_lower_vector);
  NEXT_PASS (pass_warn_function_return);
  NEXT_PASS (pass_build_cgraph_edges);
  NEXT_PASS (pass_inline_parameters);
  *p = NULL;

  ...

          NEXT_PASS (pass_final);
        }
      NEXT_PASS (pass_df_finish);
    }
  NEXT_PASS (pass_clean_state);
  *p = NULL;

#undef NEXT_PASS

  /* Register the passes with the tree dump code.  */
  register_dump_files (all_lowering_passes, false, PROP_gimple_any);
  all_lowering_passes->todo_flags_start |= TODO_set_props;
  register_dump_files (all_ipa_passes, true,
                       PROP_gimple_any | PROP_gimple_lcf | PROP_gimple_leh
                       | PROP_cfg);
  register_dump_files (all_passes, false,
                       PROP_gimple_any | PROP_gimple_lcf | PROP_gimple_leh
                       | PROP_cfg);
}

At the very end you can see that pass_final — the struct containing rest_of_handle_final(), which as covered last time turns the RTL into assembly — has been appended. The back end really is just another pass.

pass_final itself is defined in final.c. Since final must always run, its gate is NULL; a pass that should be skipped at some optimization levels instead decides inside its gate(), based on the global optimize flag, whether to run.

struct tree_opt_pass pass_final =
{
  NULL,                                 /* name */
  NULL,                                 /* gate */
  rest_of_handle_final,                 /* execute */
  NULL,                                 /* sub */
  NULL,                                 /* next */
  0,                                    /* static_pass_number */
  TV_FINAL,                             /* tv_id */
  0,                                    /* properties_required */
  0,                                    /* properties_provided */
  0,                                    /* properties_destroyed */
  0,                                    /* todo_flags_start */
  TODO_ggc_collect,                     /* todo_flags_finish */
  0                                     /* letter */
};
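For contrast, a hypothetical optimization-gated pass in the same 4.3-era style (a sketch only; a real pass would also be wired into a list in init_optimization_passes and given a timevar, dump letter, and so on):

static bool
gate_hello (void)
{
  return optimize > 0;          /* skip at -O0 */
}

static unsigned int
execute_hello (void)
{
  /* per-function work on the current function would go here */
  return 0;                     /* no extra TODO flags */
}

struct tree_opt_pass pass_hello =
{
  "hello",                      /* name: fragment of the dump file name */
  gate_hello,                   /* gate */
  execute_hello,                /* execute */
  NULL,                         /* sub */
  NULL,                         /* next */
  0,                            /* static_pass_number */
  0,                            /* tv_id */
  0,                            /* properties_required */
  0,                            /* properties_provided */
  0,                            /* properties_destroyed */
  0,                            /* todo_flags_start */
  0,                            /* todo_flags_finish */
  0                             /* letter */
};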

The framework shown here — relentlessly executing one pass after another — is the heart of GCC from the middle end onward. Understand it, and the rest is just a matter of grinding through the individual passes, after which, someday, GCC should be mastered.

The path to assembly output

  1. MAIN.C      : main()
  2. TOPLEV.C    :  toplev_main()
  3. TOPLEV.C    :   do_compile()
  4. TOPLEV.C    :    compile_file()
  5. C-OPTS.C     :    c_common_parse_file()
  6. C-PARSER.C    :     c_parse_file()
  7. C-PARSER.C    :      c_parser_translation_unit()
  8. C-PARSER.C    :       c_parser_external_declaration()
  9. C-PARSER.C    :        c_parser_declaration_or_fndef()
  10. C-DECL.C     :         finish_function()
  11. CGRAPHUNIT.C  :          cgraph_finalize_function()
  12. CGRAPHUNIT.C  :           cgraph_assemble_pending_functions()
  13. CGRAPHUNIT.C  :           cgraph_expand_function()
  14. TREE-OPTIMIZE.C :            tree_rest_of_compilation()
  15. PASSES.C     :               execute_pass_list() // run the list of optimization passes
  16. PASSES.C     :                execute_one_pass()
  17. FINAL.C      :                 rest_of_handle_final() // Turn the RTL into assembly. struct tree_opt_pass pass_final->execute()
  18. FINAL.C      :                  final() // the back end, too, is the last pass
  19. FINAL.C      :                   final_scan_insn()
  20. FINAL.C      :                    output_asm_insn()