Improved tooltip: functions, tuples, etc #847

Open
wants to merge 11 commits into
base: master
115 changes: 105 additions & 10 deletions src/handlers.jl
@@ -190,17 +190,39 @@ end
docdict(s::AbstractString) = display_dict(Core.eval(Main, helpmode(devnull, s)))

import Base: is_id_char, is_id_start_char
function get_token(code, pos)
# given a string and a cursor position, find substring to request
# help on by:
# 1) searching backwards, skipping invalid identifier chars
# ... search forward for end of identifier
# 2) search backwards to find the biggest identifier (including .)
# 3) if nothing found, do return empty string
# TODO: detect operators?

function get_previous_token(code, pos, crossed_parentheses)
"""
get_previous_token(code, pos, crossed_parentheses)

Given a string and a cursor position, find substring corresponding to previous token.
`crossed_parentheses::Int` keeps track of how many parentheses have been crossed.
A pair of parentheses yields a net crossing of 0; a '(' adds 1; a ')' subtracts 1.

Returns `(startpos, endpos, crossed_parentheses, stop)`

- `startpos` is the start position of the closest potential token before `pos`.
- `endpos` is the end position if that token can be a valid identifier, or `-1` otherwise.
- `crossed_parentheses` is the new count for parentheses.
- `stop` is true if ';' is hit, denoting the beginning of a clause.
"""
startpos = pos
separator = false
stop = false
while startpos > firstindex(code)
c = code[startpos]
if c == '('
crossed_parentheses += 1
separator = false
elseif c == ')'
crossed_parentheses -= 1
separator = false
elseif c == ';'
stop = true
elseif !is_id_char(c) && !isspace(c) && !separator
separator = true
crossed_parentheses = max(0, crossed_parentheses - 1)
end
if is_id_char(code[startpos])
break
else
@@ -213,15 +235,88 @@ function get_token(code, pos)
end
startpos = startpos < pos ? nextind(code, startpos) : pos
if !is_id_start_char(code[startpos])
return ""
return startpos, -1, crossed_parentheses, stop
end
while endpos < lastindex(code) && is_id_char(code[endpos])
endpos = nextind(code, endpos)
end
if !is_id_char(code[endpos])
endpos = prevind(code, endpos)
end
return code[startpos:endpos]
return startpos, endpos, crossed_parentheses, stop
end
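
A minimal usage sketch, for illustration only and not part of this diff: it drives the `(startpos, endpos, crossed_parentheses, stop)` contract documented above, walking backwards one token at a time. The `previous_identifiers` name is hypothetical and the `import` assumes this branch is loaded; `get_token` below is the real consumer and adds the function-detection bookkeeping.

import IJulia: get_previous_token  # internal helper, not exported

function previous_identifiers(code)
    ids = String[]
    crossed = 0
    pos = lastindex(code)
    while true
        startpos, endpos, crossed, stop = get_previous_token(code, pos, crossed)
        endpos != -1 && push!(ids, code[startpos:endpos])  # candidate identifier
        # stop at ';', at the start of the string, or once an unbalanced '(' is crossed
        (stop || startpos <= firstindex(code) || crossed > 0) && break
        pos = prevind(code, startpos)
    end
    return ids
end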

function get_token(code, pos)
"""
get_token(code, pos)

Given a string and a cursor position, find substring to request
help on by:

1. Search backwards for the closest token (may be invalid).
2. Keep searching backwards until we find a token before an unbalanced '('.
   a. If (1) is not valid, store the first valid token.
   b. We assume a token immediately before an unbalanced '(' is a function.
3. If we find a possible function token, return it.
4. Otherwise, return the last valid token.

# Important Note

Tokens are chosen following several empirical observations instead of rigorous rules.
We assume that the first valid token before left-imbalanced (more '(' than ')') parentheses is the function "closest" to cursor.
The following examples use '|' to denote cursor, showing observations on parentheses.

- `f()|` has balanced parentheses with nothing within, thus `f` is the desired token.
- `f(|)` has imbalanced parentheses, thus `f` is the desired token.
- `f(x|, y)` gives tokens `x` and `f`. `x` has balanced parentheses, while `f` is left-imbalanced. `f` is desired.
- `f(x)|` returns `f`
- `f(x, y)|` returns `f`.
- `f((x|))` returns `f`, as expected
- `f(x, (|y))` returns `f`. **This is a hack**: `crossed_parentheses` is decremented, clamped at 0, whenever a separator is encountered.
Otherwise, `x` would be returned.
- `f(x, (y|))`, `f(x, (y)|)`, and `f(x, (y))|` all behave as above. Arbitrary nesting of tuples should not cause misbehavior.
- `expr1 ; expr2`: a cursor in `expr2` never causes a search into `expr1`.

TODO: detect operators? More robust parsing using the Julia parser instead of string hacks?
"""

# Keep the cursor within the valid index range of `code`
# (use lastindex rather than length so non-ASCII strings clamp to a valid index)
pos = clamp(pos, firstindex(code), lastindex(code))

crossed_parentheses = 0
prev_startpos, prev_endpos, crossed_parentheses, stop =
get_previous_token(code, pos, crossed_parentheses)
startpos = prev_startpos
endpos = prev_endpos # used when the loop below never runs
last_valid_start = startpos
last_valid_end = -1
while !stop && startpos > firstindex(code) && crossed_parentheses <= 0
pos = prevind(code, startpos)
startpos, endpos, crossed_parentheses, stop = get_previous_token(code, pos, crossed_parentheses)
if endpos != -1 && last_valid_end == -1
last_valid_start = startpos
last_valid_end = endpos
end
end

token = ""
if crossed_parentheses > 0 # Potential function token
if endpos != -1 # Function token valid
token = code[startpos:endpos]
elseif prev_endpos != -1 # Closest token valid
token = code[prev_startpos:prev_endpos]
elseif last_valid_end != -1 # Another, farther token valid
token = code[last_valid_start:last_valid_end]
end
else # No function token found
if prev_endpos != -1 # Closest token valid
token = code[prev_startpos:prev_endpos]
elseif last_valid_end != -1 # Another, farther token valid
token = code[last_valid_start:last_valid_end]
end
end
return token
end
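
For illustration, a few calls mirroring cases from `test/completion.jl` below (the second argument is the cursor position; `get_token` is internal and unexported):

import IJulia: get_token

get_token("x + y + z", 5)                       # == "y"
get_token("""println("Hello world", x)""", 24)  # == "println" (any cursor position gives the same)
get_token("f(x, g(x))", 8)                      # == "g"  (cursor inside the inner call)
get_token("f(x, g(x))", 3)                      # == "f"  (cursor on an argument of the outer call)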

function inspect_request(socket, msg)
57 changes: 57 additions & 0 deletions test/completion.jl
@@ -0,0 +1,57 @@
using Test
import IJulia: get_token

@testset "completion tokenizer" begin
test0_code = "x + y + z"
test0_expected = "xxxxyyyyz"
test0_got = map(i -> get_token(test0_code, i), 1:length(test0_code))
@test split(test0_expected, "") == test0_got

test1_code = """println("Hello world")"""
test1_expected = "println"
test1_got = map(i -> get_token(test1_code, i), 1:length(test1_code))
@test all(test1_expected .== test1_got)

test2_code = """println("Hello world", x)"""
test2_expected = "println"
test2_got = map(i -> get_token(test2_code, i), 1:length(test2_code))
@test all(test2_expected .== test2_got)

test3_code = """println("Hello world", x, y)"""
test3_expected = "println"
test3_got = map(i -> get_token(test3_code, i), 1:length(test3_code))
@test all(test3_expected .== test3_got)

test4_code = """println("Hello world", (x, y))"""
test4_expected = "println"
test4_got = map(i -> get_token(test4_code, i), 1:length(test4_code))
@test all(test4_expected .== test4_got)

test5_code = """println("Hello world", (x, y, (2 + 3 - 5)))"""
test5_expected = "println"
test5_got = map(i -> get_token(test5_code, i), 1:length(test5_code))
@test all(test5_expected .== test5_got)

#=
# TODO: These won't work in the current, "hacky" implementation.
# The current implementation treats each token separately, returning
# "Vector", "Int", "undef", "n" respectively
test6_code = """Vector{Int}(undef, n)"""
test6_expected = "Vector"
test6_got = map(i -> get_token(test6_code, i), 1:length(test6_code))
@test all(test6_expected .== test6_got)
=#

#=
# TODO: These won't work either, mostly for the same reason
test7_code = """f(g(x))"""
test7_expected = """fffggff"""
test7_got = map(i -> get_token(test7_code, i), 1:length(test7_code))
@test split(test7_expected, "") == test7_got
=#

test8_code = """f(x, g(x))"""
test8_expected = """ffffffggff"""
test8_got = map(i -> get_token(test8_code, i), 1:length(test8_code))
@test split(test8_expected, "") == test8_got
end
2 changes: 1 addition & 1 deletion test/runtests.jl
@@ -1,4 +1,4 @@
for file in ["install.jl","comm.jl", "msg.jl", "execute_request.jl", "stdio.jl"]
for file in ["install.jl","comm.jl", "completion.jl", "msg.jl", "execute_request.jl", "stdio.jl"]
println(file)
include(file)
end