diff --git a/chapter08_computer-vision/object-detection.ipynb b/chapter08_computer-vision/object-detection.ipynb index 9fd6a6b..aad2f6e 100644 --- a/chapter08_computer-vision/object-detection.ipynb +++ b/chapter08_computer-vision/object-detection.ipynb @@ -25,10 +25,10 @@ "so we'll abuse terminology just a little.\n", "Second, while classifiers need only to output probabilities over classes,\n", "object detectors must output both probabilities of class membership\n", - "and aso the coordinates that identify the location of the objects.\n", + "and also the coordinates that identify the location of the objects.\n", "\n", "\n", - "On this chapter we'll demonstrate the single shot multiple object detector (SSD),\n", + "In this chapter we'll demonstrate the single shot multiple box object detector (SSD),\n", "a popular model for object detection that was first described in [this paper](https://arxiv.org/abs/1512.02325),\n", "and is straightforward to implement in MXNet Gluon.\n" ] @@ -43,11 +43,10 @@ "\n", "The SSD model predicts anchor boxes at multiple scales. The model architecture is illustrated in the following figure. \n", "\n", - "![](data:image/svg+xml;base64,<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xl="http://www.w3.org/1999/xlink" version="1.1" viewBox="363 277 513 269" width="513pt" height="269pt" xmlns:dc="http://purl.org/dc/elements/1.1/"><metadata> Produced by OmniGraffle 6.6.1 <dc:date>2017-08-15 21:47:34 +0000</dc:date></metadata><defs><marker orient="auto" overflow="visible" markerUnits="strokeWidth" id="SharpArrow_Marker" viewBox="-4 -4 10 8" markerWidth="10" markerHeight="8" color="black"><g><path d="M 5 0 L -3 -3 L 0 0 L 0 0 L -3 3 Z" fill="currentColor" stroke="currentColor" stroke-width="1"/></g></marker><font-face font-family="Helvetica Neue" font-size="16" panose-1="2 0 5 3 0 0 0 2 0 4" units-per-em="1000" underline-position="-100" underline-thickness="50" slope="0" x-height="517" cap-height="714" ascent="951.99585" descent="-212.99744" font-weight="500"><font-face-src><font-face-name name="HelveticaNeue"/></font-face-src></font-face><linearGradient x1="0" x2="1" id="Gradient" gradientUnits="userSpaceOnUse"><stop offset="0" stop-color="#dadada"/><stop offset="1" stop-color="#a5a5a5"/></linearGradient><linearGradient id="Obj_Gradient" xl:href="#Gradient" gradientTransform="translate(432.2213 392.56777) rotate(-178) scale(61.1913)"/><linearGradient x1="0" x2="1" id="Gradient_2" gradientUnits="userSpaceOnUse"><stop offset="0" stop-color="#dadada"/><stop offset="1" stop-color="#dadada"/></linearGradient><linearGradient id="Obj_Gradient_2" xl:href="#Gradient_2" gradientTransform="translate(449.34076 417.78345) rotate(-178) scale(23.407182)"/><linearGradient id="Obj_Gradient_3" xl:href="#Gradient_2" gradientTransform="translate(447.50292 315.16213) rotate(-178) scale(73.76219)"/><linearGradient id="Obj_Gradient_4" xl:href="#Gradient" gradientTransform="translate(737.29897 435.32794) rotate(-178) scale(36.870395)"/><linearGradient id="Obj_Gradient_5" xl:href="#Gradient_2" gradientTransform="translate(748.1703 446.17186) rotate(-178) scale(13.598863)"/><linearGradient id="Obj_Gradient_6" 
xl:href="#Gradient_2" gradientTransform="translate(747.36498 401.73303) rotate(-178) scale(45.757826)"/><linearGradient id="Obj_Gradient_7" xl:href="#Gradient" gradientTransform="translate(580.73246 418.93854) rotate(-178) scale(52.246744)"/><linearGradient id="Obj_Gradient_8" xl:href="#Gradient_2" gradientTransform="translate(595.87383 436.36728) rotate(-178) scale(19.509551)"/><linearGradient id="Obj_Gradient_9" xl:href="#Gradient_2" gradientTransform="translate(594.58944 365.14743) rotate(-178) scale(64.218004)"/></defs><g stroke="none" stroke-opacity="1" stroke-dasharray="none" fill="none" fill-opacity="1"><title>Canvas 1</title><rect fill="white" width="1037" height="619"/><g><title>Layer 1</title><line x1="746.32886" y1="425.32886" x2="821.10005" y2="425.61765" marker-end="url(#SharpArrow_Marker)" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/><line x1="720" y1="398.95973" x2="794.7712" y2="398.67094" marker-end="url(#SharpArrow_Marker)" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/><text transform="translate(729.698 374.776)" fill="black"><tspan font-family="Helvetica Neue" font-size="16" font-weight="500" x=".276" y="15" textLength="59.552">class_pr</tspan><tspan font-family="Helvetica Neue" font-size="16" font-weight="500" x="59.54" y="15" textLength="41.184">editor</tspan></text><text transform="translate(762 402.44714)" fill="black"><tspan font-family="Helvetica Neue" font-size="16" font-weight="500" x=".368" y="15" textLength="49.776">box_pr</tspan><tspan font-family="Helvetica Neue" font-size="16" font-weight="500" x="49.856" y="15" textLength="49.776">edictor</tspan></text><path d="M 374.6443 288 L 374.6443 443.25 L 428.6443 495 L 428.6443 339.75 Z" fill="url(#Obj_Gradient)"/><path d="M 374.6443 288 L 374.6443 443.25 L 428.6443 495 L 428.6443 339.75 Z" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/><rect x="428.6443" y="339.75" width="18" 
height="155.25" fill="url(#Obj_Gradient_2)"/><rect x="428.6443" y="339.75" width="18" height="155.25" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/><path d="M 374.6443 288 L 428.6443 339.75 L 446.6443 339.75 L 392.6443 288 Z" fill="url(#Obj_Gradient_3)"/><path d="M 374.6443 288 L 428.6443 339.75 L 446.6443 339.75 L 392.6443 288 Z" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/><path d="M 702 389.68456 L 702 457.18456 L 735.75 479.68456 L 735.75 412.18456 Z" fill="url(#Obj_Gradient_4)"/><path d="M 702 389.68456 L 702 457.18456 L 735.75 479.68456 L 735.75 412.18456 Z" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/><rect x="735.75" y="412.18456" width="11.25" height="67.5" fill="url(#Obj_Gradient_5)"/><rect x="735.75" y="412.18456" width="11.25" height="67.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/><path d="M 702 389.68456 L 735.75 412.18456 L 747 412.18456 L 714.85714 389.68456 Z" fill="url(#Obj_Gradient_6)"/><path d="M 702 389.68456 L 735.75 412.18456 L 747 412.18456 L 714.85714 389.68456 Z" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/><line x1="446.6443" y1="417.42515" x2="524.1001" y2="417.85675" marker-end="url(#SharpArrow_Marker)" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/><text transform="translate(469.7886 393.76258)" fill="black"><tspan font-family="Helvetica Neue" font-size="16" font-weight="500" x=".42" y="15" textLength="36.16">body</tspan></text><line x1="584.32886" y1="377.68456" x2="659.10005" y2="377.97335" marker-end="url(#SharpArrow_Marker)" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/><line x1="549" y1="351.31544" x2="623.7712" y2="351.02665" marker-end="url(#SharpArrow_Marker)" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/><line x1="594" y1="435.94722" x2="695.10035" 
y2="434.92496" marker-end="url(#SharpArrow_Marker)" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/><text transform="translate(559.5625 512.776)" fill="black"><tspan font-family="Helvetica Neue" font-size="16" font-weight="500" x=".164" y="15" textLength="50.672">scale 0</tspan></text><text transform="translate(718 512.15513)" fill="black"><tspan font-family="Helvetica Neue" font-size="16" font-weight="500" x=".164" y="15" textLength="50.672">scale 1</tspan></text><text transform="translate(417.0625 512.776)" fill="black"><tspan font-family="Helvetica Neue" font-size="16" font-weight="500" x=".064" y="15" textLength="35.872">input</tspan></text><text transform="translate(601.39136 411.09144)" fill="black"><tspan font-family="Helvetica Neue" font-size="16" font-weight="500" x=".216" y="15" textLength="91.568">downsample</tspan></text><text transform="translate(558.698 327.1317)" fill="black"><tspan font-family="Helvetica Neue" font-size="16" font-weight="500" x=".276" y="15" textLength="59.552">class_pr</tspan><tspan font-family="Helvetica Neue" font-size="16" font-weight="500" x="59.54" y="15" textLength="41.184">editor</tspan></text><text transform="translate(600 354.80285)" fill="black"><tspan font-family="Helvetica Neue" font-size="16" font-weight="500" x=".368" y="15" textLength="49.776">box_pr</tspan><tspan font-family="Helvetica Neue" font-size="16" font-weight="500" x="49.856" y="15" textLength="49.776">edictor</tspan></text><path d="M 531 346.02685 L 531 454.02685 L 578.25 490.02685 L 578.25 382.02685 Z" fill="url(#Obj_Gradient_7)"/><path d="M 531 346.02685 L 531 454.02685 L 578.25 490.02685 L 578.25 382.02685 Z" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/><rect x="578.25" y="382.02685" width="15.75" height="108" fill="url(#Obj_Gradient_8)"/><rect x="578.25" y="382.02685" width="15.75" height="108" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/><path d="M 
531 346.02685 L 578.25 382.02685 L 594 382.02685 L 549 346.02685 Z" fill="url(#Obj_Gradient_9)"/><path d="M 531 346.02685 L 578.25 382.02685 L 594 382.02685 L 549 346.02685 Z" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/></g></g></svg>
)\n", + "![](data:image/svg+xml;base64,<?xml version="1.0" encoding="UTF-8" standalone="no"?><!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xl="http://www.w3.org/1999/xlink" version="1.1" viewBox="363 277 511 269" width="511pt" height="269pt" xmlns:dc="http://purl.org/dc/elements/1.1/"><metadata> Produced by OmniGraffle 6.6.1 <dc:date>2017-09-12 18:59:01 +0000</dc:date></metadata><defs><marker orient="auto" overflow="visible" markerUnits="strokeWidth" id="SharpArrow_Marker" viewBox="-4 -4 10 8" markerWidth="10" markerHeight="8" color="black"><g><path d="M 5 0 L -3 -3 L 0 0 L 0 0 L -3 3 Z" fill="currentColor" stroke="currentColor" stroke-width="1"/></g></marker><font-face font-family="Helvetica Neue" font-size="16" panose-1="2 0 5 3 0 0 0 2 0 4" units-per-em="1000" underline-position="-100" underline-thickness="50" slope="0" x-height="517" cap-height="714" ascent="951.99585" descent="-212.99744" font-weight="500"><font-face-src><font-face-name name="HelveticaNeue"/></font-face-src></font-face><linearGradient x1="0" x2="1" id="Gradient" gradientUnits="userSpaceOnUse"><stop offset="0" stop-color="#dadada"/><stop offset="1" stop-color="#a5a5a5"/></linearGradient><linearGradient id="Obj_Gradient" xl:href="#Gradient" gradientTransform="translate(432.2213 392.56777) rotate(-178) scale(61.1913)"/><linearGradient x1="0" x2="1" id="Gradient_2" gradientUnits="userSpaceOnUse"><stop offset="0" stop-color="#dadada"/><stop offset="1" stop-color="#dadada"/></linearGradient><linearGradient id="Obj_Gradient_2" xl:href="#Gradient_2" gradientTransform="translate(449.34076 417.78345) rotate(-178) scale(23.407182)"/><linearGradient id="Obj_Gradient_3" xl:href="#Gradient_2" gradientTransform="translate(447.50292 315.16213) rotate(-178) scale(73.76219)"/><linearGradient id="Obj_Gradient_4" xl:href="#Gradient" gradientTransform="translate(737.29897 435.32794) rotate(-178) 
scale(36.870395)"/><linearGradient id="Obj_Gradient_5" xl:href="#Gradient_2" gradientTransform="translate(748.1703 446.17186) rotate(-178) scale(13.598863)"/><linearGradient id="Obj_Gradient_6" xl:href="#Gradient_2" gradientTransform="translate(747.36498 401.73303) rotate(-178) scale(45.757826)"/><linearGradient id="Obj_Gradient_7" xl:href="#Gradient" gradientTransform="translate(580.73246 418.93854) rotate(-178) scale(52.246744)"/><linearGradient id="Obj_Gradient_8" xl:href="#Gradient_2" gradientTransform="translate(595.87383 436.36728) rotate(-178) scale(19.509551)"/><linearGradient id="Obj_Gradient_9" xl:href="#Gradient_2" gradientTransform="translate(594.58944 365.14743) rotate(-178) scale(64.218004)"/></defs><g stroke="none" stroke-opacity="1" stroke-dasharray="none" fill="none" fill-opacity="1"><title>Canvas 1</title><rect fill="white" width="937" height="619"/><g><title>Layer 1</title><line x1="746.32886" y1="425.32886" x2="821.10005" y2="425.61765" marker-end="url(#SharpArrow_Marker)" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/><line x1="720" y1="398.95973" x2="794.7712" y2="398.67094" marker-end="url(#SharpArrow_Marker)" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/><text transform="translate(736.198 374.776)" fill="black"><tspan font-family="Helvetica Neue" font-size="16" font-weight="500" x=".256" y="15" textLength="56">class pr</tspan><tspan font-family="Helvetica Neue" font-size="16" font-weight="500" x="55.968" y="15" textLength="49.776">edictor</tspan></text><text transform="translate(764 402.44714)" fill="black"><tspan font-family="Helvetica Neue" font-size="16" font-weight="500" x=".144" y="15" textLength="46.224">box pr</tspan><tspan font-family="Helvetica Neue" font-size="16" font-weight="500" x="46.08" y="15" textLength="49.776">edictor</tspan></text><path d="M 374.6443 288 L 374.6443 443.25 L 428.6443 495 L 428.6443 339.75 Z" fill="url(#Obj_Gradient)"/><path d="M 374.6443 288 
L 374.6443 443.25 L 428.6443 495 L 428.6443 339.75 Z" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/><rect x="428.6443" y="339.75" width="18" height="155.25" fill="url(#Obj_Gradient_2)"/><rect x="428.6443" y="339.75" width="18" height="155.25" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/><path d="M 374.6443 288 L 428.6443 339.75 L 446.6443 339.75 L 392.6443 288 Z" fill="url(#Obj_Gradient_3)"/><path d="M 374.6443 288 L 428.6443 339.75 L 446.6443 339.75 L 392.6443 288 Z" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/><path d="M 702 389.68456 L 702 457.18456 L 735.75 479.68456 L 735.75 412.18456 Z" fill="url(#Obj_Gradient_4)"/><path d="M 702 389.68456 L 702 457.18456 L 735.75 479.68456 L 735.75 412.18456 Z" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/><rect x="735.75" y="412.18456" width="11.25" height="67.5" fill="url(#Obj_Gradient_5)"/><rect x="735.75" y="412.18456" width="11.25" height="67.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/><path d="M 702 389.68456 L 735.75 412.18456 L 747 412.18456 L 714.85714 389.68456 Z" fill="url(#Obj_Gradient_6)"/><path d="M 702 389.68456 L 735.75 412.18456 L 747 412.18456 L 714.85714 389.68456 Z" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/><line x1="446.6443" y1="417.42515" x2="524.1001" y2="417.85675" marker-end="url(#SharpArrow_Marker)" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/><text transform="translate(469.7886 393.76258)" fill="black"><tspan font-family="Helvetica Neue" font-size="16" font-weight="500" x=".42" y="15" textLength="36.16">body</tspan></text><line x1="584.32886" y1="377.68456" x2="659.10005" y2="377.97335" marker-end="url(#SharpArrow_Marker)" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/><line x1="549" y1="351.31544" x2="623.7712" 
y2="351.02665" marker-end="url(#SharpArrow_Marker)" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/><line x1="594" y1="435.94722" x2="695.10035" y2="434.92496" marker-end="url(#SharpArrow_Marker)" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/><text transform="translate(559.5625 512.776)" fill="black"><tspan font-family="Helvetica Neue" font-size="16" font-weight="500" x=".164" y="15" textLength="50.672">scale 0</tspan></text><text transform="translate(718 512.15513)" fill="black"><tspan font-family="Helvetica Neue" font-size="16" font-weight="500" x=".164" y="15" textLength="50.672">scale 1</tspan></text><text transform="translate(417.0625 512.776)" fill="black"><tspan font-family="Helvetica Neue" font-size="16" font-weight="500" x=".064" y="15" textLength="35.872">input</tspan></text><text transform="translate(601.39136 411.09144)" fill="black"><tspan font-family="Helvetica Neue" font-size="16" font-weight="500" x=".216" y="15" textLength="91.568">downsample</tspan></text><text transform="translate(556.198 327.1317)" fill="black"><tspan font-family="Helvetica Neue" font-size="16" font-weight="500" x=".256" y="15" textLength="56">class pr</tspan><tspan font-family="Helvetica Neue" font-size="16" font-weight="500" x="55.968" y="15" textLength="49.776">edictor</tspan></text><text transform="translate(602 354.80285)" fill="black"><tspan font-family="Helvetica Neue" font-size="16" font-weight="500" x=".144" y="15" textLength="46.224">box pr</tspan><tspan font-family="Helvetica Neue" font-size="16" font-weight="500" x="46.08" y="15" textLength="49.776">edictor</tspan></text><path d="M 531 346.02685 L 531 454.02685 L 578.25 490.02685 L 578.25 382.02685 Z" fill="url(#Obj_Gradient_7)"/><path d="M 531 346.02685 L 531 454.02685 L 578.25 490.02685 L 578.25 382.02685 Z" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/><rect x="578.25" y="382.02685" width="15.75" height="108" 
fill="url(#Obj_Gradient_8)"/><rect x="578.25" y="382.02685" width="15.75" height="108" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/><path d="M 531 346.02685 L 578.25 382.02685 L 594 382.02685 L 549 346.02685 Z" fill="url(#Obj_Gradient_9)"/><path d="M 531 346.02685 L 578.25 382.02685 L 594 382.02685 L 549 346.02685 Z" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/></g></g></svg>)\n", "\n", "We first use a `body` network to extract the image features, \n", - "which are used as the input to the first scale (scale 0). T\n", - "he class labels and the corresponding anchor boxes \n", + "which are used as the input to the first scale (scale 0). The class labels and the corresponding anchor boxes \n", "are predicted by `class_predictor` and `box_predictor`, respectively. \n", "We then downsample the representations to the next scale (scale 1). \n", "Again, at this new resolution, we predict both classes and anchor boxes. \n", @@ -657,7 +656,7 @@ " print('Found hash mismatch in file {}, possibly due to incomplete download.'.format(file_path))\n", " return matched\n", "\n", - "url_format = 'https://apache-mxnet.s3-accelerate.amazonaws.com/gluon/datasets/pikachu/{}'\n", + "url_format = 'https://apache-mxnet.s3-accelerate.amazonaws.com/gluon/dataset/pikachu/{}'\n", "hashes = {'train.rec': 'e6bcb6ffba1ac04ff8a9b1115e650af56ee969c8', \n", " 'train.idx': 'dcf7318b2602c06428b9988470c731621716c393', \n", " 'val.rec': 'd6c33f799b4d058e82f2cb5bd9a976f69d72d520'}\n", @@ -1299,7 +1298,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.4.3" + "version": "3.6.2" } }, "nbformat": 4, diff --git a/img/ssd.svg b/img/ssd.svg index be4b6dd..5da321a 100644 --- a/img/ssd.svg +++ b/img/ssd.svg @@ -1,3 +1,3 @@ - Produced by OmniGraffle 6.6.1 2017-08-15 21:47:34 +0000Canvas 1Layer 1class_preditorbox_predictorbodyscale 0scale 1inputdownsampleclass_preditorbox_predictor + Produced by 
OmniGraffle 6.6.1 2017-09-12 18:59:01 +0000Canvas 1Layer 1class predictorbox predictorbodyscale 0scale 1inputdownsampleclass predictorbox predictor