Skip to content

Commit

Permalink
Parse YAML-LD embedded in HTML script elements.
Browse files Browse the repository at this point in the history
  • Loading branch information
gkellogg committed Feb 3, 2024
1 parent c492900 commit d851238
Show file tree
Hide file tree
Showing 5 changed files with 288 additions and 0 deletions.
9 changes: 9 additions & 0 deletions examples/indented-stream.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"@context": https://json-ld.org/contexts/person.jsonld
"@id": http://dbpedia.org/resource/John_Lennon
name: John Lennon
born: 1940-10-09
spouse: http://dbpedia.org/resource/Cynthia_Lennon
---
"@context": https://json-ld.org/contexts/person.jsonld
"@id": http://dbpedia.org/resource/Cynthia_Lennon
born: 1939-09-10
15 changes: 15 additions & 0 deletions lib/yaml_ld/api.rb
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ def self.expand(input,
serializer: self.method(:serializer),
**options,
&block)
JSON::LD::API.add_script_loader('application/ld+yaml', self.method(:htmlLoader))
JSON::LD::API.expand(input,
allowed_content_types: %r(application/(.+\+)?yaml),
documentLoader: documentLoader,
Expand Down Expand Up @@ -113,6 +114,7 @@ def self.compact(input, context, expanded: false,
serializer: self.method(:serializer),
**options,
&block)
JSON::LD::API.add_script_loader('application/ld+yaml', self.method(:htmlLoader))
JSON::LD::API.compact(input, context, expanded: expanded,
allowed_content_types: %r(application/(.+\+)?yaml),
documentLoader: documentLoader,
Expand Down Expand Up @@ -154,6 +156,7 @@ def self.flatten(input, context, expanded: false,
serializer: self.method(:serializer),
**options,
&block)
JSON::LD::API.add_script_loader('application/ld+yaml', self.method(:htmlLoader))
JSON::LD::API.flatten(input, context, expanded: expanded,
allowed_content_types: %r(application/(.+\+)?yaml),
documentLoader: documentLoader,
Expand Down Expand Up @@ -200,6 +203,7 @@ def self.frame(input, frame, expanded: false,
serializer: self.method(:serializer),
**options,
&block)
JSON::LD::API.add_script_loader('application/ld+yaml', self.method(:htmlLoader))
JSON::LD::API.frame(input, frame, expanded: expanded,
allowed_content_types: %r(application/(.+\+)?yaml),
documentLoader: documentLoader,
Expand Down Expand Up @@ -229,6 +233,7 @@ def self.toRdf(input, expanded: false,
documentLoader: self.method(:documentLoader),
**options,
&block)
JSON::LD::API.add_script_loader('application/ld+yaml', self.method(:htmlLoader))
JSON::LD::API.toRdf(input, expanded: expanded,
allowed_content_types: %r(application/(.+\+)?yaml),
documentLoader: documentLoader,
Expand Down Expand Up @@ -334,6 +339,16 @@ def self.documentLoader(url, extractAllScripts: false, profile: nil, requestProf
end
end

##
# Script loader for YAML content taken from an HTML script element.
#
# @param [String] content text of the script element
# @param [#to_s] url location of the containing document; its string form is
#   handed to the parser as `filename`
# @param [Boolean] extractAllScripts (false)
#   when true, `content` is unindented and parsed with
#   `Representation.load_stream`; otherwise it is parsed as-is with
#   `Representation.load`
# @param [Hash{Symbol => Object}] options forwarded unchanged to the parser
# @return whatever the selected `Representation` parser returns
def self.htmlLoader(content, url:, extractAllScripts: false, **options)
  filename = url.to_s
  return Representation.load(content, filename: filename, **options) unless extractAllScripts

  # NOTE(review): unindenting presumably lets `---` document separators that
  # were indented inside the HTML start at column 0 -- confirm.
  Representation.load_stream(content.unindent, filename: filename, **options)
end

##
# The default serializer for serializing Ruby Objects to JSON.
#
Expand Down
163 changes: 163 additions & 0 deletions spec/expand_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -476,6 +476,169 @@
end
end

# Table-driven examples for expanding linked data embedded in HTML <script>
# elements. Each entry maps an example title to its :input (an HTML document),
# the expected expanded :output (JSON text), and optional extra parameters
# (:base, :extractAllScripts) that are passed through to run_expand.
context "html" do
# Only the REXML implementation is listed here.
%w[REXML].each do |impl|
# Skip the implementation when no constant with that name is defined.
next unless Module.constants.map(&:to_s).include?(impl)

context impl do
# Library selector as a lower-case symbol, e.g. :rexml.
let(:library) { impl.downcase.to_s.to_sym }

{
'Expands embedded YAML-LD script element': {
input: %(
<html>
<head>
<script type="application/ld+yaml">
"@context":
"foo":
"@id": "http://example.com/foo"
"@container": "@list"
"foo":
"@value": "bar"
</script>
</head>
</html>),
output: %([{
"http://example.com/foo": [{"@list": [{"@value": "bar"}]}]
}])
},
'Expands first script element': {
input: %(
<html>
<head>
<script type="application/ld+yaml">
"@context":
foo:
"@id": http://example.com/foo
"@container": "@list"
foo:
- "@value": bar
</script>
<script type="application/ld+yaml">
"@context":
ex: http://example.com/
"@graph":
- ex:foo:
"@value": foo
- ex:bar:
"@value": bar
</script>
</head>
</html>),
output: %([{
"http://example.com/foo": [{"@list": [{"@value": "bar"}]}]
}])
},
'Expands targeted script element': {
input: %(
<html>
<head>
<script id="first" type="application/ld+json">
{
"@context": {
"foo": {"@id": "http://example.com/foo", "@container": "@list"}
},
"foo": [{"@value": "bar"}]
}
</script>
<script id="second" type="application/ld+json">
{
"@context": {"ex": "http://example.com/"},
"@graph": [
{"ex:foo": {"@value": "foo"}},
{"ex:bar": {"@value": "bar"}}
]
}
</script>
</head>
</html>),
output: %([
{"http://example.com/foo": [{"@value": "foo"}]},
{"http://example.com/bar": [{"@value": "bar"}]}
]),
base: "http://example.org/doc#second"
},
'Expands all script elements with extractAllScripts option': {
input: %(
<html>
<head>
<script type="application/ld+yaml">
"@context":
foo:
"@id": http://example.com/foo
"@container": "@list"
foo:
- "@value": bar
</script>
<script type="application/ld+yaml">
"@context":
ex: http://example.com/
"@graph":
- ex:foo:
"@value": foo
- ex:bar:
"@value": bar
</script>
</head>
</html>),
output: %([
{"http://example.com/foo": [{"@list": [{"@value": "bar"}]}]},
{
"@graph": [{
"http://example.com/foo": [{"@value": "foo"}]
}, {
"http://example.com/bar": [{"@value": "bar"}]
}]
}
]),
extractAllScripts: true
},
'Expands all script elements with extractAllScripts option (doc stream)': {
input: %(
<html>
<head>
<script type="application/ld+yaml">
"@context":
foo:
"@id": http://example.com/foo
"@container": "@list"
foo:
- "@value": bar
---
"@context":
ex: http://example.com/
"@graph":
- ex:foo:
"@value": foo
- ex:bar:
"@value": bar
</script>
</head>
</html>),
output: %([
{"http://example.com/foo": [{"@list": [{"@value": "bar"}]}]},
{
"@graph": [{
"http://example.com/foo": [{"@value": "foo"}]
}, {
"http://example.com/bar": [{"@value": "bar"}]
}]
}
]),
extractAllScripts: true
},
}.each do |title, params|
it(title) do
# An entry can opt out of a library via a :not key; none of the entries
# above sets it.
skip "rexml" if params[:not] == library
# Wrap the HTML text in a StringIO and give that object a content_type
# method returning "text/html".
# NOTE(review): presumably this is how the loader recognises the input as
# HTML -- confirm against run_expand / the document loader.
params = params.merge(input: StringIO.new(params[:input]))
params[:input].send(:define_singleton_method, :content_type) { "text/html" }
run_expand params.merge(validate: true, library: library)
end
end
end
end
end

context "JSON-LD-star" do
{
"node with embedded subject without rdfstar option": {
Expand Down
100 changes: 100 additions & 0 deletions spec/suite_helper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -406,5 +406,105 @@ def documentLoader(url, **options, &block)
"don't raise error"
end
module_function :documentLoader

##
# Load one or more script tags from an HTML source.
# Unescapes and uncomments input, returns the internal representation
# (NOTE(review): no unescaping/uncommenting is visible in this method body;
# presumably it happens inside initialize_html_<library> -- confirm).
# Yields document base
#
# Script selection, in priority order:
# 1. If `url` has a fragment, the script whose `id` equals the (unescaped)
#    fragment is parsed.
# 2. Otherwise, if `extractAllScripts` is true, every matching script is parsed
#    and the results are merged into one Array.
# 3. Otherwise the first matching script is parsed.
#
# Only scripts whose type starts with `application/ld+json` are considered, and
# their content is parsed with MultiJson.
#
# @param [String] input
#   HTML text. A non-String value skips parsing and is used directly
#   (NOTE(review): presumably an already-parsed document responding to
#   `at_xpath`/`xpath` -- confirm).
# @param [String] url Original URL
# @param [:nokogiri, :rexml] library (nil)
#   When nil, :nokogiri is used if it can be required, otherwise :rexml.
# @param [Boolean] extractAllScripts (false)
# @param [Boolean] profile (nil) Optional prioritized profile when loading a single script by type.
# @param [Hash{Symbol => Object}] options
#   Passed to the HTML initializer and to MultiJson.load. `options[:validate]`
#   triggers `validate_input` on each script's content.
# @yield [html_base] only when `input` is a String and the document has a
#   `/html/head/base/@href`
# @yieldparam [RDF::URI] html_base the base href, joined onto `url` when `url` is given
# @return [Hash, Array] result of MultiJson.load for a single script, or an
#   Array of merged results when `extractAllScripts` is true
# @raise [JSON::LD::JsonLdError::LoadingDocumentFailed]
#   when the HTML cannot be parsed, no matching script is found, or the
#   targeted script has an unexpected type
# @raise [JSON::LD::JsonLdError::InvalidScriptElement]
#   when script content raises MultiJson::ParseError
def self.load_html(input, url:,
library: nil,
extractAllScripts: false,
profile: nil,
**options)

if input.is_a?(String)
# Choose a parser library when the caller did not specify one.
library ||= begin
require 'nokogiri'
:nokogiri
rescue LoadError
:rexml
end
require "json/ld/html/#{library}"

# Parse HTML using the appropriate library
implementation = case library
when :nokogiri then Nokogiri
when :rexml then REXML
end
extend(implementation)

# Any StandardError from the initializer is reported as a malformed document.
# NOTE(review): $ERROR_INFO is the alias for $! provided by the 'English'
# library; this relies on that library having been required elsewhere.
input = begin
send("initialize_html_#{library}".to_sym, input, **options)
rescue StandardError
raise JSON::LD::JsonLdError::LoadingDocumentFailed, "Malformed HTML document: #{$ERROR_INFO.message}"
end

# Potentially update options[:base]
# NOTE(review): `yield` is unconditional here, so a caller must supply a block
# whenever the document contains a <base href>.
if (html_base = input.at_xpath("/html/head/base/@href"))
base = RDF::URI(url) if url
html_base = RDF::URI(html_base)
html_base = base.join(html_base) if base
yield html_base
end
end

url = RDF::URI.parse(url)
if url.fragment
id = CGI.unescape(url.fragment)
# Find script with an ID based on that fragment.
element = input.at_xpath("//script[@id='#{id}']")
raise JSON::LD::JsonLdError::LoadingDocumentFailed, "No script tag found with id=#{id}" unless element

unless element.attributes['type'].to_s.start_with?('application/ld+json')
raise JSON::LD::JsonLdError::LoadingDocumentFailed,
"Script tag has type=#{element.attributes['type']}"
end

content = element.inner_html
validate_input(content, url: url) if options[:validate]
# keep_if edits `options` in place: an :adapter entry is dropped unless its
# value is listed in MUTLI_JSON_ADAPTERS; every other entry is kept.
mj_opts = options.keep_if { |k, v| k != :adapter || MUTLI_JSON_ADAPTERS.include?(v) }
MultiJson.load(content, **mj_opts)
elsif extractAllScripts
res = []
elements = if profile
es = input.xpath("//script[starts-with(@type, 'application/ld+json;profile=#{profile}')]")
# If no profile script, just take a single script without profile
es = [input.at_xpath("//script[starts-with(@type, 'application/ld+json')]")].compact if es.empty?
es
else
input.xpath("//script[starts-with(@type, 'application/ld+json')]")
end
elements.each do |element|
content = element.inner_html
validate_input(content, url: url) if options[:validate]
mj_opts = options.keep_if { |k, v| k != :adapter || MUTLI_JSON_ADAPTERS.include?(v) }
r = MultiJson.load(content, **mj_opts)
# A Hash result is appended; an Array result is spliced in element by
# element; any other parsed value is ignored.
if r.is_a?(Hash)
res << r
elsif r.is_a?(Array)
res.concat(r)
end
end
res
else
# Find the first script with type application/ld+json.
# A script matching the requested profile is preferred when `profile` is given.
element = input.at_xpath("//script[starts-with(@type, 'application/ld+json;profile=#{profile}')]") if profile
element ||= input.at_xpath("//script[starts-with(@type, 'application/ld+json')]")
raise JSON::LD::JsonLdError::LoadingDocumentFailed, "No script tag found" unless element

content = element.inner_html
validate_input(content, url: url) if options[:validate]
mj_opts = options.keep_if { |k, v| k != :adapter || MUTLI_JSON_ADAPTERS.include?(v) }
MultiJson.load(content, **mj_opts)
end
# Parse failures from any branch above surface as InvalidScriptElement.
rescue MultiJson::ParseError => e
raise JSON::LD::JsonLdError::InvalidScriptElement, e.message
end
end
end
1 change: 1 addition & 0 deletions yaml-ld.gemspec
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ Gem::Specification.new do |gem|
gem.add_runtime_dependency 'psych', '>= 3.3' # Rails 6.0 cannot use psych 4.0
gem.add_runtime_dependency 'rdf', '~> 3.3'
gem.add_runtime_dependency 'rdf-xsd', '~> 3.3'
gem.add_runtime_dependency 'rexml', '~> 3.2'
gem.add_development_dependency 'getoptlong', '~> 0.2'
gem.add_development_dependency 'rdf-isomorphic', '~> 3.3'
gem.add_development_dependency 'rdf-spec', '~> 3.3'
Expand Down

0 comments on commit d851238

Please sign in to comment.