Skip to content
This repository has been archived by the owner on Mar 8, 2020. It is now read-only.

Define a uast.ContentOf helper #377

Merged
merged 1 commit into from
Mar 26, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 65 additions & 30 deletions uast/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,11 @@ var (
)

var (
namespaces = make(map[string]string)
package2ns = make(map[string]string)
type2name = make(map[reflect.Type]nodeID)
name2type = make(map[nodeID]reflect.Type)
namespaces = make(map[string]string) // namespace to package
package2ns = make(map[string]string) // package to namespace
type2name = make(map[reflect.Type]nodeID)
name2type = make(map[nodeID]reflect.Type)
typeContentKey = make(map[string]string) // ns:type to "content" field name
)

func parseNodeID(s string) nodeID {
Expand Down Expand Up @@ -76,16 +77,36 @@ func RegisterPackage(ns string, types ...interface{}) {
package2ns[pkg] = ns

for _, o := range types {
rt := reflect.TypeOf(o)
if rt.Kind() == reflect.Ptr {
rt = rt.Elem()
registerType(ns, o)
}
}

func registerType(ns string, o interface{}) {
rt := reflect.TypeOf(o)
if rt.Kind() == reflect.Ptr {
rt = rt.Elem()
}
if name, ok := type2name[rt]; ok {
panic(fmt.Errorf("type %v already registered under %s name", rt, name))
}
id := nodeID{NS: ns, Name: rt.Name()}
type2name[rt] = id
name2type[id] = rt
if rt.Kind() != reflect.Struct {
return
}
for i := 0; i < rt.NumField(); i++ {
f := rt.Field(i)
if f.Anonymous {
continue // do not inherit content field
}
if name, ok := type2name[rt]; ok {
panic(fmt.Errorf("type %v already registered under %s name", rt, name))
d, err := getFieldDesc(f)
if err != nil {
panic(err)
}
if d.Content {
typeContentKey[id.String()] = d.Name
}
name := nodeID{NS: ns, Name: rt.Name()}
type2name[rt] = name
name2type[name] = rt
}
}

Expand All @@ -107,7 +128,7 @@ func zeroFieldsTo(obj, opt nodes.Object, rt reflect.Type) error {
}
continue
}
name, omit, err := fieldName(f)
d, err := getFieldDesc(f)
if err != nil {
return err
}
Expand All @@ -124,10 +145,10 @@ func zeroFieldsTo(obj, opt nodes.Object, rt reflect.Type) error {
case reflect.Uint, reflect.Uint64, reflect.Uint32, reflect.Uint16, reflect.Uint8:
v = nodes.Uint(0)
}
if omit {
opt[name] = v
if d.OmitEmpty {
opt[d.Name] = v
} else {
obj[name] = v
obj[d.Name] = v
}
}
return nil
Expand Down Expand Up @@ -217,23 +238,37 @@ func typeOf(tp reflect.Type) nodeID {
return nodeID{NS: ns, Name: name}
}

func fieldName(f reflect.StructField) (string, bool, error) {
name := strings.SplitN(f.Tag.Get("uast"), ",", 2)[0]
omitempty := false
if name == "" {
type fieldDesc struct {
Name string
OmitEmpty bool
Content bool
}

func getFieldDesc(f reflect.StructField) (fieldDesc, error) {
uastTag := strings.Split(f.Tag.Get("uast"), ",")
desc := fieldDesc{
Name: uastTag[0],
}
for _, s := range uastTag[1:] {
if s == "content" {
desc.Content = true
break
}
}
if desc.Name == "" {
tags := strings.Split(f.Tag.Get("json"), ",")
for _, s := range tags[1:] {
if s == "omitempty" {
omitempty = true
desc.OmitEmpty = true
break
}
}
name = tags[0]
desc.Name = tags[0]
}
if name == "" {
return "", false, fmt.Errorf("field %s should have uast or json name", f.Name)
if desc.Name == "" {
return desc, fmt.Errorf("field %s should have uast or json name", f.Name)
}
return name, omitempty, nil
return desc, nil
}

var (
Expand Down Expand Up @@ -344,18 +379,18 @@ func structToNode(obj nodes.Object, rv reflect.Value, rt reflect.Type) error {
}
continue
}
name, omit, err := fieldName(ft)
d, err := getFieldDesc(ft)
if err != nil {
return fmt.Errorf("type %s: %v", rt.Name(), err)
}
v, err := toNodeReflect(f)
if err != nil {
return err
}
if v == nil && omit {
if v == nil && d.OmitEmpty {
continue
}
obj[name] = v
obj[d.Name] = v
}
return nil
}
Expand Down Expand Up @@ -533,11 +568,11 @@ func nodeToStruct(rv reflect.Value, rt reflect.Type, obj nodes.ExternalObject) e
}
continue
}
name, _, err := fieldName(ft)
d, err := getFieldDesc(ft)
if err != nil {
return fmt.Errorf("type %s: %v", rt.Name(), err)
}
v, ok := obj.ValueAt(name)
v, ok := obj.ValueAt(d.Name)
if !ok {
continue
}
Expand Down
35 changes: 30 additions & 5 deletions uast/uast.go
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,11 @@ func RolesOf(n nodes.Node) role.Roles {
}

// TokenOf is a helper for getting node token (see KeyToken).
//
// The token is an exact code snippet that represents a given AST node. It only works for
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not necessarily in this change, but we should think about how to document expectations for the user of the UAST, about which nodes can be expected to have exact snippets and which do not. (For example: Do numeric literals have one? I think so, but the reader could not be sure, and I couldn't find a table of rules anywhere).

Anyway, I think that's something we should figure out how to document.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Relatedly: I'm worried about the expectations set up by the native/annotated vs. semantic split: If someone calls TokenOf(n) on a Semantic node n, will they get "" always? Or will it just magically work sometimes and not others? Or will it work but return canonicalized text?

For the caller, I think that disparity is going to be frustrating. Maybe we could fix it by making the API distinguish ("TokenOf always returns an empty string in Semantic mode") or maybe we should just have a single API (TextOf?) whose return value is canonicalized or not depending on mode. I don't know the right answer here—and we don't necessarily need to figure it out right now—but I am concerned that we are adding to the API surface only to satisfy a particular use case.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TokenOf by definition should always either return a code snippet or nothing. Unfortunately defining all the node types that will have the token is impossible since it differs between the languages even for trivial nodes like string literals (C# has no token in the literal node, but has a separate token node).

In general, I plan for TokenOf to always return a code snippet for any native AST. But it needs access to the source file to do it and a different kind of API. In SDK v3 I think it makes sense to replace all node types with opaque *Node that will also keep a pointer to the original source file, so we can always get snippets based on the node position.

Regarding Semantic nodes, it's a bit tricky. We can still get the snippet based on positions, but Semantic nodes might have a completely different structure and the snippet for an inner node might be larger than the snippet for the parent node. Because of this, it may make sense to give a different API promise. We may allow storing both Native and Semantic nodes (+ the source) and clearly communicate in docs that to get the code snippet (token) for a subtree, the user will first need to jump from the Semantic node to the associated Native one. This may help to draw a clear and easy-to-understand boundary for the end user.

As for ContentOf, as explained in the original issue, it's not meant to be as precise as TokenOf. It's just a quick way to get any text content for a specific node: it will first try to use any canonical text field of Semantic nodes and, if none is available, fall back to TokenOf. So they really serve different purposes.

// primitive nodes like identifiers and string literals, and is only available in Native
// and Annotated parsing modes. For Semantic mode, see ContentOf.
//
// It returns an empty string if the node is not an object, or there is no token.
func TokenOf(n nodes.Node) string {
switch n := n.(type) {
Expand All @@ -262,7 +267,7 @@ func TokenOf(n nodes.Node) string {
return ""
}

// Tokens collects all tokens of the tree recursively (pre-order).
// Tokens collects all tokens of the tree recursively (pre-order). See TokenOf.
func Tokens(n nodes.Node) []string {
var tokens []string
nodes.WalkPreOrder(n, func(n nodes.Node) bool {
Expand All @@ -276,6 +281,26 @@ func Tokens(n nodes.Node) []string {
return tokens
}

// ContentOf extracts the string content most relevant to a node: for example
// the Name of an Identifier or the Value of a String. Nodes whose type has no
// registered content field are delegated to TokenOf.
//
// Since Semantic nodes store normalized values, the result is not guaranteed
// to match the source file text exactly.
//
// An empty string is returned when the node carries no string content.
func ContentOf(n nodes.Node) string {
	obj, isObject := n.(nodes.Object)
	if !isObject {
		// not an object: only the token can provide content
		return TokenOf(n)
	}
	// a missing or non-string type yields an empty lookup key
	typ, _ := obj[KeyType].(nodes.String)
	field, hasContent := typeContentKey[string(typ)]
	if !hasContent {
		// no content field registered for this type: fallback to token
		return TokenOf(n)
	}
	// recurse so that a content field holding a nested object is resolved too
	return ContentOf(obj[field])
}

// HashNoPos hashes the node, but skips positional information.
func HashNoPos(n nodes.External) nodes.Hash {
h := nodes.NewHasher()
Expand Down Expand Up @@ -310,7 +335,7 @@ type GenNode struct {
type Identifier struct {
GenNode
// Name of an entity. Can be any valid UTF8 string.
Name string `json:"Name"`
Name string `json:"Name" uast:",content"`
}

// Roles returns a list of UAST node roles that apply to this node.
Expand All @@ -337,7 +362,7 @@ type String struct {
// Value is a UTF8 string literal value.
//
// Drivers should remove any quotes and unescape the value according to the language rules.
Value string `json:"Value"`
Value string `json:"Value" uast:",content"`

// Format is an optional language-specific string that describes the format of the literal.
//
Expand Down Expand Up @@ -388,7 +413,7 @@ type Comment struct {
// */
//
// only "some comment" is considered a text
Text string `json:"Text"`
Text string `json:"Text" uast:",content"`

// Prefix is a set of whitespaces and stylistic characters that appear before
// the first line of an actual comment text.
Expand Down Expand Up @@ -606,5 +631,5 @@ type Function struct {
// Bool is a boolean literal.
type Bool struct {
GenNode
Value bool `json:"Value"`
Value bool `json:"Value" uast:",content"`
}
63 changes: 63 additions & 0 deletions uast/uast_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,69 @@ func tObj(typ, tok string) Obj {
return obj
}

// TestContentOf converts each sample value to a node with ToNode and checks
// that ContentOf returns the expected string for it.
func TestContentOf(t *testing.T) {
	type testCase struct {
		name string
		node interface{}
		exp  string
	}
	tests := []testCase{
		// primitive values
		{name: "string", node: nodes.String("a"), exp: "a"},
		{name: "int", node: nodes.Int(1), exp: "1"},
		// typed UAST nodes
		{name: "string lit", node: String{Value: "a"}, exp: "a"},
		{name: "bool lit", node: Bool{Value: true}, exp: "true"},
		{name: "identifier", node: Identifier{Name: "a"}, exp: "a"},
		{name: "comment", node: Comment{Prefix: " ", Text: "a"}, exp: "a"},
		{
			name: "array",
			node: nodes.Array{nodes.String("a"), nodes.String("b")},
			// TODO(dennwc): define it later if we find a valid use case for it
			exp: "",
		},
	}
	for _, tc := range tests {
		t.Run(tc.name, func(t *testing.T) {
			converted, err := ToNode(tc.node)
			require.NoError(t, err)
			require.Equal(t, tc.exp, ContentOf(converted))
		})
	}
}

func TestPrefixTokens(t *testing.T) {
require := require.New(t)

Expand Down