Вы находитесь на странице: 1 из 43

<html xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40"> <head> <meta http-equiv=Content-Type content="text/html; charset=windows-1252"> <meta name=ProgId content=Word.Document>

<meta name=Generator content="Microsoft Word 14"> <meta name=Originator content="Microsoft Word 14"> <link rel=File-List href="auth_proposal_files/filelist.xml"> <link rel=Edit-Time-Data href="auth_proposal_files/editdata.mso"> <!--[if !mso]> <style> v\:* {behavior:url(#default#VML);} o\:* {behavior:url(#default#VML);} w\:* {behavior:url(#default#VML);} .shape {behavior:url(#default#VML);} </style> <![endif]--> <title>Heritrix Negotiation of Authentication Schemes</title> <!--[if gte mso 9]><xml> <o:DocumentProperties> <o:Author>Charlie Pepper</o:Author> <o:LastAuthor>Charlie Pepper</o:LastAuthor> <o:Revision>3</o:Revision> <o:TotalTime>0</o:TotalTime> <o:Created>2013-06-04T16:50:00Z</o:Created> <o:LastSaved>2013-06-04T16:50:00Z</o:LastSaved> <o:Pages>2</o:Pages> <o:Words>3930</o:Words> <o:Characters>22401</o:Characters> <o:Company>Microsoft</o:Company> <o:Lines>186</o:Lines> <o:Paragraphs>52</o:Paragraphs> <o:CharactersWithSpaces>26279</o:CharactersWithSpaces> <o:Version>14.00</o:Version> </o:DocumentProperties> <o:OfficeDocumentSettings> <o:AllowPNG/> </o:OfficeDocumentSettings> </xml><![endif]--> <link rel=themeData href="auth_proposal_files/themedata.thmx"> <link rel=colorSchemeMapping href="auth_proposal_files/colorschememapping.xml"> <!--[if gte mso 9]><xml> <w:WordDocument> <w:TrackMoves>false</w:TrackMoves> <w:TrackFormatting/> <w:ValidateAgainstSchemas/> <w:SaveIfXMLInvalid>false</w:SaveIfXMLInvalid> <w:IgnoreMixedContent>false</w:IgnoreMixedContent> <w:AlwaysShowPlaceholderText>false</w:AlwaysShowPlaceholderText> <w:DoNotPromoteQF/> <w:LidThemeOther>EN-US</w:LidThemeOther> <w:LidThemeAsian>X-NONE</w:LidThemeAsian> <w:LidThemeComplexScript>X-NONE</w:LidThemeComplexScript> <w:Compatibility> <w:BreakWrappedTables/> <w:SplitPgBreakAndParaMark/>

</w:Compatibility> <m:mathPr> <m:mathFont m:val="Cambria Math"/> <m:brkBin m:val="before"/> <m:brkBinSub m:val="&#45;-"/> <m:smallFrac m:val="off"/> <m:dispDef/> <m:lMargin m:val="0"/> <m:rMargin m:val="0"/> <m:defJc m:val="centerGroup"/> <m:wrapIndent m:val="1440"/> <m:intLim m:val="subSup"/> <m:naryLim m:val="undOvr"/> </m:mathPr></w:WordDocument> </xml><![endif]--><!--[if gte mso 9]><xml> <w:LatentStyles DefLockedState="false" DefUnhideWhenUsed="true" DefSemiHidden="true" DefQFormat="false" DefPriority="99" LatentStyleCount="267"> <w:LsdException Locked="false" Priority="0" SemiHidden="false" UnhideWhenUsed="false" QFormat="true" Name="Normal"/> <w:LsdException Locked="false" Priority="9" SemiHidden="false" UnhideWhenUsed="false" QFormat="true" Name="heading 1"/> <w:LsdException Locked="false" Priority="9" SemiHidden="false" UnhideWhenUsed="false" QFormat="true" Name="heading 2"/> <w:LsdException Locked="false" Priority="9" SemiHidden="false" UnhideWhenUsed="false" QFormat="true" Name="heading 3"/> <w:LsdException Locked="false" Priority="9" SemiHidden="false" UnhideWhenUsed="false" QFormat="true" Name="heading 4"/> <w:LsdException Locked="false" Priority="9" SemiHidden="false" UnhideWhenUsed="false" QFormat="true" Name="heading 5"/> <w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 6"/> <w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 7"/> <w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 8"/> <w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 9"/> <w:LsdException Locked="false" Priority="39" Name="toc 1"/> <w:LsdException Locked="false" Priority="39" Name="toc 2"/> <w:LsdException Locked="false" Priority="39" Name="toc 3"/> <w:LsdException Locked="false" Priority="39" Name="toc 4"/> <w:LsdException Locked="false" Priority="39" Name="toc 5"/> <w:LsdException Locked="false" Priority="39" Name="toc 6"/> <w:LsdException Locked="false" Priority="39" Name="toc 7"/> 
<w:LsdException Locked="false" Priority="39" Name="toc 8"/> <w:LsdException Locked="false" Priority="39" Name="toc 9"/> <w:LsdException Locked="false" Priority="35" QFormat="true" Name="caption"/> <w:LsdException Locked="false" Priority="10" SemiHidden="false" UnhideWhenUsed="false" QFormat="true" Name="Title"/> <w:LsdException Locked="false" Priority="1" Name="Default Paragraph Font"/> <w:LsdException Locked="false" Priority="11" SemiHidden="false" UnhideWhenUsed="false" QFormat="true" Name="Subtitle"/> <w:LsdException Locked="false" Priority="22" SemiHidden="false" UnhideWhenUsed="false" QFormat="true" Name="Strong"/> <w:LsdException Locked="false" Priority="20" SemiHidden="false" UnhideWhenUsed="false" QFormat="true" Name="Emphasis"/> <w:LsdException Locked="false" Priority="59" SemiHidden="false" UnhideWhenUsed="false" Name="Table Grid"/> <w:LsdException Locked="false" UnhideWhenUsed="false" Name="Placeholder Text"/> <w:LsdException Locked="false" Priority="1" SemiHidden="false" UnhideWhenUsed="false" QFormat="true" Name="No Spacing"/> <w:LsdException Locked="false" Priority="60" SemiHidden="false"

UnhideWhenUsed="false" Name="Light Shading"/> <w:LsdException Locked="false" Priority="61" SemiHidden="false" UnhideWhenUsed="false" Name="Light List"/> <w:LsdException Locked="false" Priority="62" SemiHidden="false" UnhideWhenUsed="false" Name="Light Grid"/> <w:LsdException Locked="false" Priority="63" SemiHidden="false" UnhideWhenUsed="false" Name="Medium Shading 1"/> <w:LsdException Locked="false" Priority="64" SemiHidden="false" UnhideWhenUsed="false" Name="Medium Shading 2"/> <w:LsdException Locked="false" Priority="65" SemiHidden="false" UnhideWhenUsed="false" Name="Medium List 1"/> <w:LsdException Locked="false" Priority="66" SemiHidden="false" UnhideWhenUsed="false" Name="Medium List 2"/> <w:LsdException Locked="false" Priority="67" SemiHidden="false" UnhideWhenUsed="false" Name="Medium Grid 1"/> <w:LsdException Locked="false" Priority="68" SemiHidden="false" UnhideWhenUsed="false" Name="Medium Grid 2"/> <w:LsdException Locked="false" Priority="69" SemiHidden="false" UnhideWhenUsed="false" Name="Medium Grid 3"/> <w:LsdException Locked="false" Priority="70" SemiHidden="false" UnhideWhenUsed="false" Name="Dark List"/> <w:LsdException Locked="false" Priority="71" SemiHidden="false" UnhideWhenUsed="false" Name="Colorful Shading"/> <w:LsdException Locked="false" Priority="72" SemiHidden="false" UnhideWhenUsed="false" Name="Colorful List"/> <w:LsdException Locked="false" Priority="73" SemiHidden="false" UnhideWhenUsed="false" Name="Colorful Grid"/> <w:LsdException Locked="false" Priority="60" SemiHidden="false" UnhideWhenUsed="false" Name="Light Shading Accent 1"/> <w:LsdException Locked="false" Priority="61" SemiHidden="false" UnhideWhenUsed="false" Name="Light List Accent 1"/> <w:LsdException Locked="false" Priority="62" SemiHidden="false" UnhideWhenUsed="false" Name="Light Grid Accent 1"/> <w:LsdException Locked="false" Priority="63" SemiHidden="false" UnhideWhenUsed="false" Name="Medium Shading 1 Accent 1"/> <w:LsdException Locked="false" Priority="64" 
SemiHidden="false" UnhideWhenUsed="false" Name="Medium Shading 2 Accent 1"/> <w:LsdException Locked="false" Priority="65" SemiHidden="false" UnhideWhenUsed="false" Name="Medium List 1 Accent 1"/> <w:LsdException Locked="false" UnhideWhenUsed="false" Name="Revision"/> <w:LsdException Locked="false" Priority="34" SemiHidden="false" UnhideWhenUsed="false" QFormat="true" Name="List Paragraph"/> <w:LsdException Locked="false" Priority="29" SemiHidden="false" UnhideWhenUsed="false" QFormat="true" Name="Quote"/> <w:LsdException Locked="false" Priority="30" SemiHidden="false" UnhideWhenUsed="false" QFormat="true" Name="Intense Quote"/> <w:LsdException Locked="false" Priority="66" SemiHidden="false" UnhideWhenUsed="false" Name="Medium List 2 Accent 1"/> <w:LsdException Locked="false" Priority="67" SemiHidden="false" UnhideWhenUsed="false" Name="Medium Grid 1 Accent 1"/> <w:LsdException Locked="false" Priority="68" SemiHidden="false" UnhideWhenUsed="false" Name="Medium Grid 2 Accent 1"/> <w:LsdException Locked="false" Priority="69" SemiHidden="false" UnhideWhenUsed="false" Name="Medium Grid 3 Accent 1"/> <w:LsdException Locked="false" Priority="70" SemiHidden="false" UnhideWhenUsed="false" Name="Dark List Accent 1"/> <w:LsdException Locked="false" Priority="71" SemiHidden="false" UnhideWhenUsed="false" Name="Colorful Shading Accent 1"/> <w:LsdException Locked="false" Priority="72" SemiHidden="false" UnhideWhenUsed="false" Name="Colorful List Accent 1"/>

<w:LsdException Locked="false" Priority="73" SemiHidden="false" UnhideWhenUsed="false" Name="Colorful Grid Accent 1"/> <w:LsdException Locked="false" Priority="60" SemiHidden="false" UnhideWhenUsed="false" Name="Light Shading Accent 2"/> <w:LsdException Locked="false" Priority="61" SemiHidden="false" UnhideWhenUsed="false" Name="Light List Accent 2"/> <w:LsdException Locked="false" Priority="62" SemiHidden="false" UnhideWhenUsed="false" Name="Light Grid Accent 2"/> <w:LsdException Locked="false" Priority="63" SemiHidden="false" UnhideWhenUsed="false" Name="Medium Shading 1 Accent 2"/> <w:LsdException Locked="false" Priority="64" SemiHidden="false" UnhideWhenUsed="false" Name="Medium Shading 2 Accent 2"/> <w:LsdException Locked="false" Priority="65" SemiHidden="false" UnhideWhenUsed="false" Name="Medium List 1 Accent 2"/> <w:LsdException Locked="false" Priority="66" SemiHidden="false" UnhideWhenUsed="false" Name="Medium List 2 Accent 2"/> <w:LsdException Locked="false" Priority="67" SemiHidden="false" UnhideWhenUsed="false" Name="Medium Grid 1 Accent 2"/> <w:LsdException Locked="false" Priority="68" SemiHidden="false" UnhideWhenUsed="false" Name="Medium Grid 2 Accent 2"/> <w:LsdException Locked="false" Priority="69" SemiHidden="false" UnhideWhenUsed="false" Name="Medium Grid 3 Accent 2"/> <w:LsdException Locked="false" Priority="70" SemiHidden="false" UnhideWhenUsed="false" Name="Dark List Accent 2"/> <w:LsdException Locked="false" Priority="71" SemiHidden="false" UnhideWhenUsed="false" Name="Colorful Shading Accent 2"/> <w:LsdException Locked="false" Priority="72" SemiHidden="false" UnhideWhenUsed="false" Name="Colorful List Accent 2"/> <w:LsdException Locked="false" Priority="73" SemiHidden="false" UnhideWhenUsed="false" Name="Colorful Grid Accent 2"/> <w:LsdException Locked="false" Priority="60" SemiHidden="false" UnhideWhenUsed="false" Name="Light Shading Accent 3"/> <w:LsdException Locked="false" Priority="61" SemiHidden="false" UnhideWhenUsed="false" 
Name="Light List Accent 3"/> <w:LsdException Locked="false" Priority="62" SemiHidden="false" UnhideWhenUsed="false" Name="Light Grid Accent 3"/> <w:LsdException Locked="false" Priority="63" SemiHidden="false" UnhideWhenUsed="false" Name="Medium Shading 1 Accent 3"/> <w:LsdException Locked="false" Priority="64" SemiHidden="false" UnhideWhenUsed="false" Name="Medium Shading 2 Accent 3"/> <w:LsdException Locked="false" Priority="65" SemiHidden="false" UnhideWhenUsed="false" Name="Medium List 1 Accent 3"/> <w:LsdException Locked="false" Priority="66" SemiHidden="false" UnhideWhenUsed="false" Name="Medium List 2 Accent 3"/> <w:LsdException Locked="false" Priority="67" SemiHidden="false" UnhideWhenUsed="false" Name="Medium Grid 1 Accent 3"/> <w:LsdException Locked="false" Priority="68" SemiHidden="false" UnhideWhenUsed="false" Name="Medium Grid 2 Accent 3"/> <w:LsdException Locked="false" Priority="69" SemiHidden="false" UnhideWhenUsed="false" Name="Medium Grid 3 Accent 3"/> <w:LsdException Locked="false" Priority="70" SemiHidden="false" UnhideWhenUsed="false" Name="Dark List Accent 3"/> <w:LsdException Locked="false" Priority="71" SemiHidden="false" UnhideWhenUsed="false" Name="Colorful Shading Accent 3"/> <w:LsdException Locked="false" Priority="72" SemiHidden="false" UnhideWhenUsed="false" Name="Colorful List Accent 3"/> <w:LsdException Locked="false" Priority="73" SemiHidden="false" UnhideWhenUsed="false" Name="Colorful Grid Accent 3"/> <w:LsdException Locked="false" Priority="60" SemiHidden="false" UnhideWhenUsed="false" Name="Light Shading Accent 4"/>

<w:LsdException Locked="false" Priority="61" SemiHidden="false" UnhideWhenUsed="false" Name="Light List Accent 4"/> <w:LsdException Locked="false" Priority="62" SemiHidden="false" UnhideWhenUsed="false" Name="Light Grid Accent 4"/> <w:LsdException Locked="false" Priority="63" SemiHidden="false" UnhideWhenUsed="false" Name="Medium Shading 1 Accent 4"/> <w:LsdException Locked="false" Priority="64" SemiHidden="false" UnhideWhenUsed="false" Name="Medium Shading 2 Accent 4"/> <w:LsdException Locked="false" Priority="65" SemiHidden="false" UnhideWhenUsed="false" Name="Medium List 1 Accent 4"/> <w:LsdException Locked="false" Priority="66" SemiHidden="false" UnhideWhenUsed="false" Name="Medium List 2 Accent 4"/> <w:LsdException Locked="false" Priority="67" SemiHidden="false" UnhideWhenUsed="false" Name="Medium Grid 1 Accent 4"/> <w:LsdException Locked="false" Priority="68" SemiHidden="false" UnhideWhenUsed="false" Name="Medium Grid 2 Accent 4"/> <w:LsdException Locked="false" Priority="69" SemiHidden="false" UnhideWhenUsed="false" Name="Medium Grid 3 Accent 4"/> <w:LsdException Locked="false" Priority="70" SemiHidden="false" UnhideWhenUsed="false" Name="Dark List Accent 4"/> <w:LsdException Locked="false" Priority="71" SemiHidden="false" UnhideWhenUsed="false" Name="Colorful Shading Accent 4"/> <w:LsdException Locked="false" Priority="72" SemiHidden="false" UnhideWhenUsed="false" Name="Colorful List Accent 4"/> <w:LsdException Locked="false" Priority="73" SemiHidden="false" UnhideWhenUsed="false" Name="Colorful Grid Accent 4"/> <w:LsdException Locked="false" Priority="60" SemiHidden="false" UnhideWhenUsed="false" Name="Light Shading Accent 5"/> <w:LsdException Locked="false" Priority="61" SemiHidden="false" UnhideWhenUsed="false" Name="Light List Accent 5"/> <w:LsdException Locked="false" Priority="62" SemiHidden="false" UnhideWhenUsed="false" Name="Light Grid Accent 5"/> <w:LsdException Locked="false" Priority="63" SemiHidden="false" UnhideWhenUsed="false" Name="Medium Shading 1 Accent 5"/>
<w:LsdException Locked="false" Priority="64" SemiHidden="false" UnhideWhenUsed="false" Name="Medium Shading 2 Accent 5"/> <w:LsdException Locked="false" Priority="65" SemiHidden="false" UnhideWhenUsed="false" Name="Medium List 1 Accent 5"/> <w:LsdException Locked="false" Priority="66" SemiHidden="false" UnhideWhenUsed="false" Name="Medium List 2 Accent 5"/> <w:LsdException Locked="false" Priority="67" SemiHidden="false" UnhideWhenUsed="false" Name="Medium Grid 1 Accent 5"/> <w:LsdException Locked="false" Priority="68" SemiHidden="false" UnhideWhenUsed="false" Name="Medium Grid 2 Accent 5"/> <w:LsdException Locked="false" Priority="69" SemiHidden="false" UnhideWhenUsed="false" Name="Medium Grid 3 Accent 5"/> <w:LsdException Locked="false" Priority="70" SemiHidden="false" UnhideWhenUsed="false" Name="Dark List Accent 5"/> <w:LsdException Locked="false" Priority="71" SemiHidden="false" UnhideWhenUsed="false" Name="Colorful Shading Accent 5"/> <w:LsdException Locked="false" Priority="72" SemiHidden="false" UnhideWhenUsed="false" Name="Colorful List Accent 5"/> <w:LsdException Locked="false" Priority="73" SemiHidden="false" UnhideWhenUsed="false" Name="Colorful Grid Accent 5"/> <w:LsdException Locked="false" Priority="60" SemiHidden="false" UnhideWhenUsed="false" Name="Light Shading Accent 6"/> <w:LsdException Locked="false" Priority="61" SemiHidden="false" UnhideWhenUsed="false" Name="Light List Accent 6"/> <w:LsdException Locked="false" Priority="62" SemiHidden="false" UnhideWhenUsed="false" Name="Light Grid Accent 6"/>

<w:LsdException Locked="false" Priority="63" SemiHidden="false" UnhideWhenUsed="false" Name="Medium Shading 1 Accent 6"/> <w:LsdException Locked="false" Priority="64" SemiHidden="false" UnhideWhenUsed="false" Name="Medium Shading 2 Accent 6"/> <w:LsdException Locked="false" Priority="65" SemiHidden="false" UnhideWhenUsed="false" Name="Medium List 1 Accent 6"/> <w:LsdException Locked="false" Priority="66" SemiHidden="false" UnhideWhenUsed="false" Name="Medium List 2 Accent 6"/> <w:LsdException Locked="false" Priority="67" SemiHidden="false" UnhideWhenUsed="false" Name="Medium Grid 1 Accent 6"/> <w:LsdException Locked="false" Priority="68" SemiHidden="false" UnhideWhenUsed="false" Name="Medium Grid 2 Accent 6"/> <w:LsdException Locked="false" Priority="69" SemiHidden="false" UnhideWhenUsed="false" Name="Medium Grid 3 Accent 6"/> <w:LsdException Locked="false" Priority="70" SemiHidden="false" UnhideWhenUsed="false" Name="Dark List Accent 6"/> <w:LsdException Locked="false" Priority="71" SemiHidden="false" UnhideWhenUsed="false" Name="Colorful Shading Accent 6"/> <w:LsdException Locked="false" Priority="72" SemiHidden="false" UnhideWhenUsed="false" Name="Colorful List Accent 6"/> <w:LsdException Locked="false" Priority="73" SemiHidden="false" UnhideWhenUsed="false" Name="Colorful Grid Accent 6"/> <w:LsdException Locked="false" Priority="19" SemiHidden="false" UnhideWhenUsed="false" QFormat="true" Name="Subtle Emphasis"/> <w:LsdException Locked="false" Priority="21" SemiHidden="false" UnhideWhenUsed="false" QFormat="true" Name="Intense Emphasis"/> <w:LsdException Locked="false" Priority="31" SemiHidden="false" UnhideWhenUsed="false" QFormat="true" Name="Subtle Reference"/> <w:LsdException Locked="false" Priority="32" SemiHidden="false" UnhideWhenUsed="false" QFormat="true" Name="Intense Reference"/> <w:LsdException Locked="false" Priority="33" SemiHidden="false" UnhideWhenUsed="false" QFormat="true" Name="Book Title"/> <w:LsdException Locked="false" Priority="37" 
Name="Bibliography"/> <w:LsdException Locked="false" Priority="39" QFormat="true" Name="TOC Heading" /> </w:LatentStyles> </xml><![endif]--> <style> <!-- /* Style Definitions */ p.MsoNormal, li.MsoNormal, div.MsoNormal {mso-style-unhide:no; mso-style-qformat:yes; mso-style-parent:""; margin:0in; margin-bottom:.0001pt; mso-pagination:widow-orphan; font-size:12.0pt; font-family:"Times New Roman","serif"; mso-fareast-font-family:"Times New Roman"; mso-fareast-theme-font:minor-fareast; color:black;} h2 {mso-style-priority:9; mso-style-unhide:no; mso-style-qformat:yes; mso-style-link:"Heading 2 Char"; mso-margin-top-alt:auto; margin-right:0in; mso-margin-bottom-alt:auto;

margin-left:0in; mso-pagination:widow-orphan; mso-outline-level:2; font-size:18.0pt; font-family:"Times New Roman","serif"; mso-fareast-font-family:"Times New Roman"; mso-fareast-theme-font:minor-fareast; color:black; font-weight:bold;} h3 {mso-style-priority:9; mso-style-unhide:no; mso-style-qformat:yes; mso-style-link:"Heading 3 Char"; mso-margin-top-alt:auto; margin-right:0in; mso-margin-bottom-alt:auto; margin-left:0in; mso-pagination:widow-orphan; mso-outline-level:3; font-size:13.5pt; font-family:"Times New Roman","serif"; mso-fareast-font-family:"Times New Roman"; mso-fareast-theme-font:minor-fareast; color:black; font-weight:bold;} h4 {mso-style-priority:9; mso-style-unhide:no; mso-style-qformat:yes; mso-style-link:"Heading 4 Char"; mso-margin-top-alt:auto; margin-right:0in; mso-margin-bottom-alt:auto; margin-left:0in; mso-pagination:widow-orphan; mso-outline-level:4; font-size:12.0pt; font-family:"Times New Roman","serif"; mso-fareast-font-family:"Times New Roman"; mso-fareast-theme-font:minor-fareast; color:black; font-weight:bold;} h5 {mso-style-priority:9; mso-style-unhide:no; mso-style-qformat:yes; mso-style-link:"Heading 5 Char"; mso-margin-top-alt:auto; margin-right:0in; mso-margin-bottom-alt:auto; margin-left:0in; mso-pagination:widow-orphan; mso-outline-level:5; font-size:10.0pt; font-family:"Times New Roman","serif"; mso-fareast-font-family:"Times New Roman"; mso-fareast-theme-font:minor-fareast; color:black; font-weight:bold;}

a:link, span.MsoHyperlink {mso-style-noshow:yes; mso-style-priority:99; color:blue; text-decoration:underline; text-underline:single;} a:visited, span.MsoHyperlinkFollowed {mso-style-noshow:yes; mso-style-priority:99; color:#840084; text-decoration:underline; text-underline:single;} p {mso-style-noshow:yes; mso-style-priority:99; mso-margin-top-alt:auto; margin-right:0in; mso-margin-bottom-alt:auto; margin-left:0in; mso-pagination:widow-orphan; font-size:12.0pt; font-family:"Times New Roman","serif"; mso-fareast-font-family:"Times New Roman"; mso-fareast-theme-font:minor-fareast; color:black;} tt {mso-style-noshow:yes; mso-style-priority:99; font-family:"Courier New"; mso-ascii-font-family:"Courier New"; mso-fareast-font-family:"Times New Roman"; mso-fareast-theme-font:minor-fareast; mso-hansi-font-family:"Courier New"; mso-bidi-font-family:"Courier New";} span.Heading2Char {mso-style-name:"Heading 2 Char"; mso-style-noshow:yes; mso-style-priority:9; mso-style-unhide:no; mso-style-locked:yes; mso-style-link:"Heading 2"; mso-ansi-font-size:13.0pt; mso-bidi-font-size:13.0pt; font-family:"Cambria","serif"; mso-ascii-font-family:Cambria; mso-ascii-theme-font:major-latin; mso-fareast-font-family:"Times New Roman"; mso-fareast-theme-font:major-fareast; mso-hansi-font-family:Cambria; mso-hansi-theme-font:major-latin; mso-bidi-font-family:"Times New Roman"; mso-bidi-theme-font:major-bidi; color:#4F81BD; mso-themecolor:accent1; font-weight:bold;} span.Heading3Char {mso-style-name:"Heading 3 Char"; mso-style-noshow:yes; mso-style-priority:9; mso-style-unhide:no;

mso-style-locked:yes; mso-style-link:"Heading 3"; mso-ansi-font-size:12.0pt; mso-bidi-font-size:12.0pt; font-family:"Cambria","serif"; mso-ascii-font-family:Cambria; mso-ascii-theme-font:major-latin; mso-fareast-font-family:"Times New Roman"; mso-fareast-theme-font:major-fareast; mso-hansi-font-family:Cambria; mso-hansi-theme-font:major-latin; mso-bidi-font-family:"Times New Roman"; mso-bidi-theme-font:major-bidi; color:#4F81BD; mso-themecolor:accent1; font-weight:bold;} span.firstname {mso-style-name:firstname; mso-style-unhide:no;} span.surname {mso-style-name:surname; mso-style-unhide:no;} span.orgname {mso-style-name:orgname; mso-style-unhide:no;} p.title, li.title, div.title {mso-style-name:title; mso-style-unhide:no; mso-margin-top-alt:auto; margin-right:0in; mso-margin-bottom-alt:auto; margin-left:0in; mso-pagination:widow-orphan; font-size:12.0pt; font-family:"Times New Roman","serif"; mso-fareast-font-family:"Times New Roman"; mso-fareast-theme-font:minor-fareast; color:black;} span.Heading4Char {mso-style-name:"Heading 4 Char"; mso-style-noshow:yes; mso-style-priority:9; mso-style-unhide:no; mso-style-locked:yes; mso-style-link:"Heading 4"; mso-ansi-font-size:12.0pt; mso-bidi-font-size:12.0pt; font-family:"Cambria","serif"; mso-ascii-font-family:Cambria; mso-ascii-theme-font:major-latin; mso-fareast-font-family:"Times New Roman"; mso-fareast-theme-font:major-fareast; mso-hansi-font-family:Cambria; mso-hansi-theme-font:major-latin; mso-bidi-font-family:"Times New Roman"; mso-bidi-theme-font:major-bidi; color:#4F81BD; mso-themecolor:accent1; font-weight:bold; font-style:italic;}

span.citation {mso-style-name:citation; mso-style-unhide:no;} span.abbrev {mso-style-name:abbrev; mso-style-unhide:no;} span.emphasis {mso-style-name:emphasis; mso-style-unhide:no;} span.Heading5Char {mso-style-name:"Heading 5 Char"; mso-style-noshow:yes; mso-style-priority:9; mso-style-unhide:no; mso-style-locked:yes; mso-style-link:"Heading 5"; mso-ansi-font-size:12.0pt; mso-bidi-font-size:12.0pt; font-family:"Cambria","serif"; mso-ascii-font-family:Cambria; mso-ascii-theme-font:major-latin; mso-fareast-font-family:"Times New Roman"; mso-fareast-theme-font:major-fareast; mso-hansi-font-family:Cambria; mso-hansi-theme-font:major-latin; mso-bidi-font-family:"Times New Roman"; mso-bidi-theme-font:major-bidi; color:#243F60; mso-themecolor:accent1; mso-themeshade:127;} span.title1 {mso-style-name:title1; mso-style-unhide:no;} span.edition {mso-style-name:edition; mso-style-unhide:no;} .MsoChpDefault {mso-style-type:export-only; mso-default-props:yes; font-size:10.0pt; mso-ansi-font-size:10.0pt; mso-bidi-font-size:10.0pt;} @page WordSection1 {size:8.5in 11.0in; margin:1.0in 1.0in 1.0in 1.0in; mso-header-margin:.5in; mso-footer-margin:.5in; mso-paper-source:0;} div.WordSection1 {page:WordSection1;} /* List Definitions */ @list l0 {mso-list-id:107966782; mso-list-template-ids:700897826;} @list l0:level1 {mso-level-number-format:bullet; mso-level-text:\F0B7; mso-level-tab-stop:.5in; mso-level-number-position:left; text-indent:-.25in;

mso-ansi-font-size:10.0pt; font-family:Symbol;} @list l0:level2 {mso-level-number-format:bullet; mso-level-text:\F0B7; mso-level-tab-stop:1.0in; mso-level-number-position:left; text-indent:-.25in; mso-ansi-font-size:10.0pt; font-family:Symbol;} @list l0:level3 {mso-level-number-format:bullet; mso-level-text:\F0B7; mso-level-tab-stop:1.5in; mso-level-number-position:left; text-indent:-.25in; mso-ansi-font-size:10.0pt; font-family:Symbol;} @list l0:level4 {mso-level-number-format:bullet; mso-level-text:\F0B7; mso-level-tab-stop:2.0in; mso-level-number-position:left; text-indent:-.25in; mso-ansi-font-size:10.0pt; font-family:Symbol;} @list l0:level5 {mso-level-number-format:bullet; mso-level-text:\F0B7; mso-level-tab-stop:2.5in; mso-level-number-position:left; text-indent:-.25in; mso-ansi-font-size:10.0pt; font-family:Symbol;} @list l0:level6 {mso-level-number-format:bullet; mso-level-text:\F0B7; mso-level-tab-stop:3.0in; mso-level-number-position:left; text-indent:-.25in; mso-ansi-font-size:10.0pt; font-family:Symbol;} @list l0:level7 {mso-level-number-format:bullet; mso-level-text:\F0B7; mso-level-tab-stop:3.5in; mso-level-number-position:left; text-indent:-.25in; mso-ansi-font-size:10.0pt; font-family:Symbol;} @list l0:level8 {mso-level-number-format:bullet; mso-level-text:\F0B7; mso-level-tab-stop:4.0in; mso-level-number-position:left; text-indent:-.25in; mso-ansi-font-size:10.0pt; font-family:Symbol;} @list l0:level9 {mso-level-number-format:bullet;

mso-level-text:\F0B7; mso-level-tab-stop:4.5in; mso-level-number-position:left; text-indent:-.25in; mso-ansi-font-size:10.0pt; font-family:Symbol;} @list l1 {mso-list-id:1633170806; mso-list-template-ids:1220334686;} @list l1:level1 {mso-level-number-format:bullet; mso-level-text:\F0B7; mso-level-tab-stop:.5in; mso-level-number-position:left; text-indent:-.25in; mso-ansi-font-size:10.0pt; font-family:Symbol;} @list l1:level2 {mso-level-number-format:bullet; mso-level-text:\F0B7; mso-level-tab-stop:1.0in; mso-level-number-position:left; text-indent:-.25in; mso-ansi-font-size:10.0pt; font-family:Symbol;} @list l1:level3 {mso-level-number-format:bullet; mso-level-text:\F0B7; mso-level-tab-stop:1.5in; mso-level-number-position:left; text-indent:-.25in; mso-ansi-font-size:10.0pt; font-family:Symbol;} @list l1:level4 {mso-level-number-format:bullet; mso-level-text:\F0B7; mso-level-tab-stop:2.0in; mso-level-number-position:left; text-indent:-.25in; mso-ansi-font-size:10.0pt; font-family:Symbol;} @list l1:level5 {mso-level-number-format:bullet; mso-level-text:\F0B7; mso-level-tab-stop:2.5in; mso-level-number-position:left; text-indent:-.25in; mso-ansi-font-size:10.0pt; font-family:Symbol;} @list l1:level6 {mso-level-number-format:bullet; mso-level-text:\F0B7; mso-level-tab-stop:3.0in; mso-level-number-position:left; text-indent:-.25in; mso-ansi-font-size:10.0pt; font-family:Symbol;} @list l1:level7 {mso-level-number-format:bullet; mso-level-text:\F0B7;

mso-level-tab-stop:3.5in; mso-level-number-position:left; text-indent:-.25in; mso-ansi-font-size:10.0pt; font-family:Symbol;} @list l1:level8 {mso-level-number-format:bullet; mso-level-text:\F0B7; mso-level-tab-stop:4.0in; mso-level-number-position:left; text-indent:-.25in; mso-ansi-font-size:10.0pt; font-family:Symbol;} @list l1:level9 {mso-level-number-format:bullet; mso-level-text:\F0B7; mso-level-tab-stop:4.5in; mso-level-number-position:left; text-indent:-.25in; mso-ansi-font-size:10.0pt; font-family:Symbol;} ol {margin-bottom:0in;} ul {margin-bottom:0in;} --> </style> <!--[if gte mso 10]> <style> /* Style Definitions */ table.MsoNormalTable {mso-style-name:"Table Normal"; mso-tstyle-rowband-size:0; mso-tstyle-colband-size:0; mso-style-noshow:yes; mso-style-priority:99; mso-style-parent:""; mso-padding-alt:0in 5.4pt 0in 5.4pt; mso-para-margin:0in; mso-para-margin-bottom:.0001pt; mso-pagination:widow-orphan; font-size:10.0pt; font-family:"Times New Roman","serif";} </style> <![endif]--><!--[if gte mso 9]><xml> <o:shapedefaults v:ext="edit" spidmax="1026"/> </xml><![endif]--><!--[if gte mso 9]><xml> <o:shapelayout v:ext="edit"> <o:idmap v:ext="edit" data="1"/> </o:shapelayout></xml><![endif]--> </head> <body bgcolor=white lang=EN-US link=blue vlink="#840084" style='tab-interval: .5in' alink="#0000FF"> <div class=WordSection1> <h2><a name=N10001></a><span lang=EN style='mso-fareast-font-family:"Times New R oman"; mso-ansi-language:EN'>2<o:p></o:p></span></h2>

<h2><span lang=EN style='mso-fareast-font-family:"Times New Roman";mso-ansi-lang uage: EN'><o:p>&nbsp;</o:p></span></h2> <h2><span lang=EN style='mso-fareast-font-family:"Times New Roman";mso-ansi-lang uage: EN'>Heritrix Negotiation of Authentication Schemes<o:p></o:p></span></h2> <div> <h3><i><span lang=EN style='mso-fareast-font-family:"Times New Roman"; mso-ansi-language:EN'>A Proposal to address RFE <a href="https://sourceforge.net/tracker/index.php?func=detail&amp;aid=914301&amp;g roup_id=73833&amp;atid=539102" target="_top">[ 914301 ] Logging in (HTTP POST, Basic Auth, etc.)</a></span></i> <span lang=EN style='mso-fareast-font-family:"Times New Roman";mso-ansi-language: EN'><o:p></o:p></span></h3> </div> <div> <div> <h3><span class=firstname><span lang=EN style='mso-fareast-font-family:"Times Ne w Roman"; mso-ansi-language:EN'>Michael</span></span><span lang=EN style='mso-fareast-font -family: "Times New Roman";mso-ansi-language:EN'> <span class=surname>Stack</span><o:p></ o:p></span></h3> <div> <p class=MsoNormal><span class=orgname><span lang=EN style='mso-fareast-font-fam ily: "Times New Roman";mso-ansi-language:EN'>Internet Archive</span></span><span lang=EN style='mso-fareast-font-family:"Times New Roman";mso-ansi-language: EN'><o:p></o:p></span></p> </div> </div> </div> <div class=MsoNormal align=center style='text-align:center'><span lang=EN style='mso-fareast-font-family:"Times New Roman";mso-ansi-language:EN'> <hr size=2 width="100%" align=center> </span></div> <div> <p><b><span lang=EN style='mso-ansi-language:EN'>Table of Contents</span></b><sp an lang=EN style='mso-ansi-language:EN'><o:p></o:p></span></p>

<p class=MsoNormal><span lang=EN style='mso-fareast-font-family:"Times New Roman "; mso-ansi-language:EN'>1. <a href="#N1001B">Introduction</a><o:p></o:p></span></p > <p class=MsoNormal style='margin-left:.5in'><span lang=EN style='mso-fareast-fon t-family: "Times New Roman";mso-ansi-language:EN'>1.1. <a href="#N10024">Scope</a><o:p></o :p></span></p> <p class=MsoNormal style='margin-left:.5in'><span lang=EN style='mso-fareast-fon t-family: "Times New Roman";mso-ansi-language:EN'>1.2. <a href="#N10037">Assumptions</a><o :p></o:p></span></p> <p class=MsoNormal><span lang=EN style='mso-fareast-font-family:"Times New Roman "; mso-ansi-language:EN'>2. <a href="#schemes">Authentication Schemes</a><o:p></o:p ></span></p> <p class=MsoNormal style='margin-left:.5in'><span lang=EN style='mso-fareast-fon t-family: "Times New Roman";mso-ansi-language:EN'>2.1. <a href="#basicdesc">Basic and Digest Access Authentication </a><o:p></o:p></span></p> <p class=MsoNormal style='margin-left:.5in'><span lang=EN style='mso-fareast-fon t-family: "Times New Roman";mso-ansi-language:EN'>2.2. <a href="#postdesc">HTTP POST and GET of Authentication Credentials</a><o:p></o:p></span></p> <p class=MsoNormal style='margin-left:.5in'><span lang=EN style='mso-fareast-fon t-family: "Times New Roman";mso-ansi-language:EN'>2.3. <a href="#clientcertdesc">X509 Client Certificates</a><o:p></o:p></span></p> <p class=MsoNormal style='margin-left:.5in'><span lang=EN style='mso-fareast-fon t-family: "Times New Roman";mso-ansi-language:EN'>2.4. <a href="#ntlmdesc">NTLM </a><o:p>< /o:p></span></p> <p class=MsoNormal><span lang=EN style='mso-fareast-font-family:"Times New Roman "; mso-ansi-language:EN'>3. <a href="#N100CD">Proposal</a><o:p></o:p></span></p> <p class=MsoNormal style='margin-left:.5in'><span lang=EN style='mso-fareast-fon t-family: "Times New Roman";mso-ansi-language:EN'>3.1. 
<a href="#N100ED">Basic and Digest Access Authentication </a><o:p></o:p></span></p> <p class=MsoNormal style='margin-left:.5in'><span lang=EN style='mso-fareast-fon t-family: "Times New Roman";mso-ansi-language:EN'>3.2. <a href="#N1016B">HTTP POST and GET of Authentication Credentials</a><o:p></o:p></span></p> <p class=MsoNormal style='margin-left:.5in'><span lang=EN style='mso-fareast-fon t-family: "Times New Roman";mso-ansi-language:EN'>3.3. <a href="#commonage">Commonage</a>< o:p></o:p></span></p> <p class=MsoNormal><span lang=EN style='mso-fareast-font-family:"Times New Roman

"; mso-ansi-language:EN'>4. <a href="#N10271">Design</a><o:p></o:p></span></p> <p class=MsoNormal style='margin-left:.5in'><span lang=EN style='mso-fareast-fon t-family: "Times New Roman";mso-ansi-language:EN'>4.1. <a href="#N10274">Configuration</a> <o:p></o:p></span></p> <p class=MsoNormal style='margin-left:.5in'><span lang=EN style='mso-fareast-fon t-family: "Times New Roman";mso-ansi-language:EN'>4.2. <a href="#N10279">Credential store< /a><o:p></o:p></span></p> <p class=MsoNormal><span lang=EN style='mso-fareast-font-family:"Times New Roman "; mso-ansi-language:EN'>5. <a href="#N10287">Future</a><o:p></o:p></span></p> <p class=MsoNormal style='margin-left:.5in'><span lang=EN style='mso-fareast-fon t-family: "Times New Roman";mso-ansi-language:EN'>5.1. <a href="#N1028C">Same URL different Page Content</a><o:p></o:p></span></p> <p class=MsoNormal style='margin-left:.5in'><span lang=EN style='mso-fareast-fon t-family: "Times New Roman";mso-ansi-language:EN'>5.2. <a href="#N10291">Integration with the UI</a><o:p></o:p></span></p> <p class=MsoNormal><span lang=EN style='mso-fareast-font-family:"Times New Roman "; mso-ansi-language:EN'><a href="#N10296">Bibliography</a><o:p></o:p></span></p> </div> <div> <p class=title><b><span lang=EN style='mso-ansi-language:EN'>Abstract</span></b> <span lang=EN style='mso-ansi-language:EN'><o:p></o:p></span></p> <p><span lang=EN style='mso-ansi-language:EN'>Description of common web authentication schemes. Description of the problem volunteering credentials at the appropriate juncture. Proposal for navigating HTTP POST login and Basic Auth for when Heritrix has been supplied credentials ahead of the authorization challenge.<o:p></o:p></span></p> </div> <div> <div> <div> <div> <h2><a name=N1001B></a><span lang=EN style='mso-fareast-font-family:"Times New R oman"; mso-ansi-language:EN'>1.&nbsp;Introduction<o:p></o:p></span></h2> </div>

</div> </div> <p><span lang=EN style='mso-ansi-language:EN'>This document is divided into two parts. The first part discusses common web authentication schemes eliminating the less common. The second part outlines Heritrix negotiation of HTML login forms and Basic/Digest Auth authentication schemes. At the end is a list of items to consider for future versions of the authentication system.<o:p></o:p></span></p> <p><span lang=EN style='mso-ansi-language:EN'>The intent of this document is to solicit feedback in advance of implementation.<o:p></o:p></span></p> <p><span lang=EN style='mso-ansi-language:EN'>The rest of this introduction is given over to scope and assumptions made in this document.<o:p></o:p></span></p> <div> <div> <div> <div> <h3><a name=N10024></a><span lang=EN style='mso-fareast-font-family:"Times New Roman"; mso-ansi-language:EN'>1.1.&nbsp;Scope<o:p></o:p></span></h3> </div> </div> </div> <div> <div> <div> <div> <h4><a name=N10027></a><span lang=EN style='mso-fareast-font-family:"Times New Roman"; mso-ansi-language:EN'>1.1.1.&nbsp;Delivery timeline<o:p></o:p></span></h4> </div> </div> </div> <p><span lang=EN style='mso-ansi-language:EN'>Delivery on the proposal is to be parcelled out over Heritrix versions. A first cut at Heritrix form-based POST/GET authentication is to be included in version 1.0 (End of April, 2004).<o:p></o:p></span></p> </div>

<div> <div> <div> <div> <h4><a name=N1002C></a><span lang=EN style='mso-fareast-font-family:"Times New R oman"; mso-ansi-language:EN'>1.1.2.&nbsp;Common web authentication schemes only<o:p></o :p></span></h4> </div> </div> </div> <p><span lang=EN style='mso-ansi-language:EN'>This proposal is for the common web authentication schemes only: E.g. HTTP POST to a HTML form, and Basic and Digest Auth. This proposal does not cover the Heritrix crawler authenticating against a LDAP server, PAM, getting tickets from a Kerberos server, negotiating single sign-ons, etc.<o:p></o:p></span></p> </div> <div> <div> <div> <div> <h4><a name=connbased></a><span lang=EN style='mso-fareast-font-family:"Times Ne w Roman"; mso-ansi-language:EN'>1.1.3.&nbsp;Connection-based authentication schemes<o:p></ o:p></span></h4> </div> </div> </div> <p><span lang=EN style='mso-ansi-language:EN'>Connection-based authentication schemes are outside the scope of this proposal. They are antithetical to the current Heritrix mode of operation. Consideration of connection-based authentication schemes is postponed until Heritrix does other than HTTP/1.0 behavior of getting a new connection per request.<o:p></o:p></span></p> </div> </div> <div> <div>

<div> <div> <h3><a name=N10037></a><span lang=EN style='mso-fareast-font-family:"Times New R oman"; mso-ansi-language:EN'>1.2.&nbsp;Assumptions<o:p></o:p></span></h3> </div> </div> </div> <div> <div> <div> <div> <h4><a name=N1003A></a><span lang=EN style='mso-fareast-font-family:"Times New R oman"; mso-ansi-language:EN'>1.2.1.&nbsp;Heritrix has been granted necessary authentication credentials<o:p></o:p></span></h4> </div> </div> </div> <p><span lang=EN style='mso-ansi-language:EN'>Assumption is that Heritrix has been granted legitimate access to the site we're trying to log into ahead of the login attempt; that the site owners have given permission and the necessary login/password combination and/or certificates necessary to gain access.<o:p></o :p></span></p> </div> <div> <div> <div> <div> <h4><a name=procchainassumption></a><span lang=EN style='mso-fareast-font-family : "Times New Roman";mso-ansi-language:EN'>1.2.2.&nbsp;Heritrix URI processing chain<o:p></o:p></span></h4> </div> </div> </div>

<p><span lang=EN style='mso-ansi-language:EN'>Assumption is that this proposal integrates with the Heritrix URI processing chains model [<span class=citation>See <a href="http://crawler.archive.org/user.html" target="_top">URI Processing Chains</a> </span>] rather than go to an authentication framework such as <a href="#jaas" target="_top">JAAS</a> and encapsulate the complete authentication dialog within a JAAS LoginModule plugin, with a plugin per authentication scheme supported. On the one hand, the Heritrix URI processing chain lends itself naturally to the processing of the common web authentication mechanisms with its core notions of HTML fetching and extracting, and besides, the authentication dialog will likely have links to harvest. On the other hand, authentication will be spread about the application.<o:p></o:p></span></p> </div> <div> <div> <div> <div> <h4><a name=N10050></a><span lang=EN style='mso-fareast-font-family:"Times New Roman"; mso-ansi-language:EN'>1.2.3.&nbsp;No means of recording credentials used authenticating in an ARC<o:p></o:p></span></h4> </div> </div> </div> <p><span lang=EN style='mso-ansi-language:EN'>There is no means currently for recording in an arc file the credentials used getting to pages (If we recorded the request, we'd have some hope of archiving them).<o:p></o:p></span></p> </div> <div> <div> <div> <div> <h4><a name=N10055></a><span lang=EN style='mso-fareast-font-family:"Times New Roman"; mso-ansi-language:EN'>1.2.4.&nbsp;Credentials store does not need to be secured<o:p></o:p></span></h4> </div> </div> </div>

<p><span lang=EN style='mso-ansi-language:EN'>Assumption is that Heritrix does not need to secure the store in which we keep credentials to offer up during authentications; the credentials store does not need to be saved on disk encrypted and password protected.<o:p></o:p></span></p> </div> </div> </div> <div> <div> <div> <div> <h2><a name=schemes></a><span lang=EN style='mso-fareast-font-family:"Times New Roman"; mso-ansi-language:EN'>2.&nbsp;Authentication Schemes<o:p></o:p></span></h2> </div> </div> </div> <p><span lang=EN style='mso-ansi-language:EN'>This section discusses common web authentication schemes and where applicable, practical issues navigating the schemes' requirements. The first two described, <a href="#basicdesc" title="2.1.&nbsp;Basic and Digest Access Authentication ">Section&nbsp;2.1, Basic and Digest Access Authentication </a> and <a href="#postdesc" title="2.2.&nbsp;HTTP POST and GET of Authentication Credentials">Section&nbsp;2 .2, HTTP POST and GET of Authentication Credentials </a>, are assumed most commonly used.<o:p></o:p></span></p> <div> <div> <div> <div> <h3><a name=basicdesc></a><span lang=EN style='mso-fareast-font-family:"Times Ne w Roman"; mso-ansi-language:EN'>2.1.&nbsp;Basic and Digest Access Authentication [<a href="#rfc2617" title="[rfc2617]">rfc2617</a>]<o:p></o:p></span></h3> </div> </div> </div> <p><span lang=EN style='mso-ansi-language:EN'>The server returns a HTTP response code of </span><tt><span lang=EN style='font-size:10.0pt;mso-ansi-langu

age: EN'>401 Unauthorized</span></tt><span lang=EN style='mso-ansi-language:EN'> or </span><tt><span lang=EN style='font-size:10.0pt;mso-ansi-language:EN'>407 Proxy Authentication Required</span></tt><span lang=EN style='mso-ansi-language:EN'> when it requires authentication of the client.<o:p></o:p></span></p> <div> <blockquote style='margin-top:5.0pt;margin-bottom:5.0pt'> <p><span lang=EN style='mso-ansi-language:EN'>The realm directive (case-insensitive) is required for all authentication schemes that issue a challenge. The realm value (case-sensitive), in combination with the canonical root URL...of the server being accessed, defines the protection space. [<a href="#rfc2617" title="[rfc2617]">rfc2617</a>]<o:p></o:p></span></p> </blockquote> </div> <p><span lang=EN style='mso-ansi-language:EN'>The canonical root URL is discussed in this message, <a href="http://cert.uni-stuttgart.de/archive/bugtraq/1999/08/msg00380.html" target="_top">Re: IE and cached passwords</a>. It is scheme + hostname + port only. Path and query string have been stripped. Effectively, it equates to scheme + <a href="http://java.sun.com/j2se/1.4.2/docs/api/java/net/URI.html" target="_top">URI authority.</a><o:p></o:p></span></p> <div> <blockquote style='margin-top:5.0pt;margin-bottom:5.0pt'> <p><span lang=EN style='mso-ansi-language:EN'>A client SHOULD assume that all paths at or deeper than the depth of the last symbolic element in the path field of the Request-URI also are within the protection space specified by the Basic realm value of the current challenge. A client MAY preemptively send the corresponding Authorization header with requests for resources in that space without receipt of another challenge from the server. 
[<a href="#rfc2617" title="[rfc2617]">rfc2617</a>]<o:p></o:p></span></p> </blockquote> </div> </div> <div> <div> <div> <div> <h3><a name=postdesc></a><span lang=EN style='mso-fareast-font-family:"Times New Roman"; mso-ansi-language:EN'>2.2.&nbsp;HTTP POST and GET of Authentication Credentials< o:p></o:p></span></h3>

</div> </div> </div> <p><span lang=EN style='mso-ansi-language:EN'>Generally, this scheme works as follows. When an unauthenticated client attempts to access a protected area, they are redirected by the server to a page with an HTML login form. The client must then HTTP POST or HTTP GET the HTML form with the client access credentials filled in. Upon verification of the credentials by the server, the client is given access. So the client does not need to pass credentials on all subsequent accesses to the protected areas of the site, the server will mark the client usually in one of two ways: It will write a special, usually time- and scope-limited, token, or &quot;cookie&quot;, back to the client which the client volunteers on all subsequent accesses, or the server will serve pages that have embedded URLs rewritten to include a special token. The tokens are examined by the server on each subsequent access for validity and access continues while the token remains valid.<o:p></o:p></span></p> <p><span lang=EN style='mso-ansi-language:EN'>There is no standard for how this dialogue is supposed to proceed. Myriad are the implementations of this basic scheme. 
Below is a listing of common difficulties:<o:p></o:p></span></p> <div> <p style='margin-left:.5in;text-indent:-.25in;mso-list:l0 level1 lfo1; tab-stops:list .5in'><![if !supportLists]><span lang=EN style='font-size:10.0pt; mso-bidi-font-size:12.0pt;font-family:Symbol;mso-fareast-font-family:Symbol; mso-bidi-font-family:Symbol;mso-ansi-language:EN'><span style='mso-list:Ignore'><span style='font:7.0pt "Times New Roman"'>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; </span></span></span><![endif]><span lang=EN style='mso-ansi-language:EN'>Form field item names are variant.<o:p></o:p></span></p> <p style='margin-left:.5in;text-indent:-.25in;mso-list:l0 level1 lfo1; tab-stops:list .5in'><![if !supportLists]><span lang=EN style='font-size:10.0pt; mso-bidi-font-size:12.0pt;font-family:Symbol;mso-fareast-font-family:Symbol; mso-bidi-font-family:Symbol;mso-ansi-language:EN'><span style='mso-list:Ignore'><span style='font:7.0pt "Times New Roman"'>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; </span></span></span><![endif]><span lang=EN style='mso-ansi-language:EN'>Means by which unsuccessful login is reported to the client varies. A client can be redirected to a new failed login page or the original login page is redrawn with the inclusion of a banner message reporting on the failed login.<o:p></o:p></span> </p> <p style='margin-left:.5in;text-indent:-.25in;mso-list:l0 level1 lfo1; tab-stops:list .5in'><![if !supportLists]><span lang=EN style='font-size:10.0pt; mso-bidi-font-size:12.0pt;font-family:Symbol;mso-fareast-font-family:Symbol; mso-bidi-font-family:Symbol;mso-ansi-language:EN'><span style='mso-list:Ignore'><span style='font:7.0pt "Times New Roman"'>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; </span></span></span><![endif]><span lang=EN style='mso-ansi-language:EN'>Following on from the previous point, should a solution POST authentication and then do all

necessary to ensure a successful login -- i.e. follow redirects, regex over the result page to ensure it says &quot;successful login&quot;, etc. -- or should a solution do nought but POST and then give whatever the resultant page to the Heritrix URI processing chain whether successful or not?<o:p></o:p></span></p> <div style='margin-left:.5in;margin-right:.5in'> <h3 style='margin-left:.5in'><span lang=EN style='mso-fareast-font-family:"Times New Roman"; mso-ansi-language:EN'>Processing of form success page?<o:p></o:p></span></h3> <p style='margin-left:.5in'><span lang=EN style='mso-ansi-language:EN'>The result page should probably be let through. It may have valuable links on board. The alternative would necessitate our running an out-of-band subset of the Heritrix URI processing chain POSTing/GETting authentication running extractors to verify result of login attempt. This mini authentication chain could be kept tidy encapsulated within a login module -- see <a href="#procchainassumption" title="1.2.2.&nbsp;Heritrix URI processing chain">Se ction&nbsp;1.2.2, Heritrix URI processing chain </a>-- but ugly would be how to transfer such as the cookies from the mini chain over to the main URI processing chain.<o:p></o:p ></span></p> </div> <p style='margin-left:.5in;text-indent:-.25in;mso-list:l0 level1 lfo1; tab-stops:list .5in'><![if !supportLists]><span lang=EN style='font-size:10.0pt; mso-bidi-font-size:12.0pt;font-family:Symbol;mso-fareast-font-family:Symbol; mso-bidi-font-family:Symbol;mso-ansi-language:EN'><span style='mso-list:Ignore'>< span style='font:7.0pt "Times New Roman"'>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;& nbsp; </span></span></span><![endif]><span lang=EN style='mso-ansi-language:EN'>The aforementioned differing ways in which the server parks in the client a validated token.<o:p></o:p></span></p> <p style='margin-left:.5in;text-indent:-.25in;mso-list:l0 level1 lfo1; tab-stops:list .5in'><![if !supportLists]><span lang=EN style='font-size:10.0pt; 
mso-bidi-font-size:12.0pt;font-family:Symbol;mso-fareast-font-family:Symbol; mso-bidi-font-family:Symbol;mso-ansi-language:EN'><span style='mso-list:Ignore'>< span style='font:7.0pt "Times New Roman"'>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;& nbsp; </span></span></span><![endif]><span lang=EN style='mso-ansi-language:EN'>What if login attempt fails? Should we retry? For how long? Means maintaining a state across URI processing?<o:p></o:p></span></p> <p style='margin-left:.5in;text-indent:-.25in;mso-list:l0 level1 lfo1; tab-stops:list .5in'><![if !supportLists]><span lang=EN style='font-size:10.0pt; mso-bidi-font-size:12.0pt;font-family:Symbol;mso-fareast-font-family:Symbol; mso-bidi-font-family:Symbol;mso-ansi-language:EN'><span style='mso-list:Ignore'>< span style='font:7.0pt "Times New Roman"'>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;& nbsp; </span></span></span><![endif]><span lang=EN style='mso-ansi-language:EN'>Should there be tools to help an operator develop Heritrix authentication configuration? Should a tool be developed that runs the login outside of the Heritrix context to make it easier on operator developing the authentication configuration?<o:p></o:p></span></p>

</div> </div> <div> <div> <div> <div> <h3><a name=clientcertdesc></a><span lang=EN style='mso-fareast-font-family: "Times New Roman";mso-ansi-language:EN'>2.3.&nbsp;X509 Client Certificates<o:p></o:p></span></h3> </div> </div> </div> <p><span lang=EN style='mso-ansi-language:EN'>To gain access, the client must volunteer a trusted certificate setting up an SSL connection to the server. Upon receipt, the server tests that the client is entitled to access.<o:p></o:p></span></p> <p><span lang=EN style='mso-ansi-language:EN'>It is probably rare that client certificates alone will be used as access protection. More likely, certificates will be used in combination with one of the above listed schemes.<o:p></o:p></span></p> <p><span lang=EN style='mso-ansi-language:EN'>The certificate the client is to volunteer needs to be in a local TrustStore available to the Heritrix TrustManager making the SSL connection (Heritrix already maintains its own keystore of certificates to use verifying server proffered certs).<o:p></o:p></span></p> <div style='margin-left:.5in;margin-right:.5in'> <h3><span lang=EN style='mso-fareast-font-family:"Times New Roman";mso-ansi-language: EN'>Testing<o:p></o:p></span></h3> <p><span lang=EN style='mso-ansi-language:EN'>Test to see if certificates are volunteered even in case where we're running in open trust mode. Test to see how hard it is to append a host-particular keystore to the general Heritrix keystore at runtime.<o:p></o:p></span></p> </div> </div> <div> <div> <div> <div>

<h3><a name=ntlmdesc></a><span lang=EN style='mso-fareast-font-family:"Times New Roman"; mso-ansi-language:EN'>2.4.&nbsp;NTLM [<a href="#ntlm" title="[ntlm]"><span class=abbrev>ntlm</span></a>]<o:p></o:p></span></h3> </div> </div> </div> <div> <blockquote style='margin-top:5.0pt;margin-bottom:5.0pt'> <p><span lang=EN style='mso-ansi-language:EN'>NTLM is...a proprietary protocol designed by Microsoft with no publicly available specification. Early version of NTLM were less secure than Digest authentication due to faults in the design, however these were fixed in a service pack for Windows NT 4 and the protocol is now considered more secure than Digest authentication... There are some significant differences in the way that NTLM works compared with basic and digest authentication...NTLM authenticates a connection and not a request, so you need to authenticate every time a new connection is made and keeping the connection open during authentication is vital. Due to this, NTLM cannot be used to authenticate with both a proxy and the server, nor can NTLM be used with HTTP 1.0 connections or servers that do not support HTTP keep-alives. [<a href="#httpclient" title="[httpclient]"><span class=abbrev>httpclient</span></a> ]<o:p></o:p></span></p> </blockquote> </div> <p><span lang=EN style='mso-ansi-language:EN'>The NTLM is put outside the scope of this proposal because its nature is antithetical to how Heritrix works: i.e. It authenticates the connection, not a session [<span class=citation>Also see <a href="#connbased" title="1.1.3.&nbsp;Connection-based authentication schemes">Se ction&nbsp;1.1.3, Connection-based authentication schemes </a> </span>]. Related, the implementation is incomplete in httpclient. NTLM will not be discussed further.< o:p></o:p></span></p> </div> </div> <div> <div> <div> <div> <h2><a name=N100CD></a><span lang=EN style='mso-fareast-font-family:"Times New R oman"; mso-ansi-language:EN'>3.&nbsp;Proposal<o:p></o:p></span></h2> </div>

</div> </div> <p><span lang=EN style='mso-ansi-language:EN'>Proposal is to put off implementation of client-side certificates in Heritrix. Rare is the case where it is needed.<o:p></o:p></span></p> <div style='margin-left:.5in;margin-right:.5in'> <h3><span lang=EN style='mso-fareast-font-family:"Times New Roman";mso-ansi-language: EN'>Workaround?<o:p></o:p></span></h3> <p><span lang=EN style='mso-ansi-language:EN'>It should be possible to just add the client certificate to the local truststore and all would just work. Test.<o:p></o:p></span></p> </div> <p><span lang=EN style='mso-ansi-language:EN'>Having cut <a href="#ntlmdesc" title="2.4.&nbsp;NTLM ">Section&nbsp;2.4, NTLM </a> and <a href="#clientcertdesc" title="2.3.&nbsp;X509 Client Certificates">Section&nbsp;2.3, X509 Client Certificates </a>, we're left with <a href="#basicdesc" title="2.1.&nbsp;Basic and Digest Access Authentication ">Section&nbsp;2.1, Basic and Digest Access Authentication </a> and <a href="#postdesc" title="2.2.&nbsp;HTTP POST and GET of Authentication Credentials">Section&nbsp;2.2, HTTP POST and GET of Authentication Credentials </a>, the assumed most commonly used web authentication schemes.<o:p></o:p></span></p> <p><span lang=EN style='mso-ansi-language:EN'>Reading in the above, <a href="#schemes" title="2.&nbsp;Authentication Schemes">Section&nbsp;2, Authentication Schemes </a>, it may be apparent that there can not be one solution that will work for both schemes. 
The discussion in the following two sections -- a section per scheme under consideration -- should bring this fact out and help identify facility common to the two schemes detailed later in <a href="#commonage" title="3.3.&nbsp;Commonage">Section&nbsp;3.3, Commonage </a>.<o:p ></o:p></span></p> <div> <div> <div> <div> <h3><a name=N100ED></a><span lang=EN style='mso-fareast-font-family:"Times New R oman"; mso-ansi-language:EN'>3.1.&nbsp;Basic and Digest Access Authentication [<a href="#rfc2617" title="[rfc2617]">rfc2617</a>]<o:p></o:p></span></h3> </div> </div> </div>

<p><span lang=EN style='mso-ansi-language:EN'>A basic implementation would, upon receipt of a 401 response status code, extract a realm from the 401 response and use this </span><tt><i><span lang=EN style='font-size:10.0pt; mso-ansi-language:EN'>realm + URI canonical root URL</span></i></tt><span lang=EN style='mso-ansi-language:EN'> as a compound key to do a look up into a store of Basic/Digest Auth credentials. If a match is found, the <em>persistent domain/virtualdomain object</em> made for the current domain is loaded with the discovered credentials and the 401'ing current URI is marked for retry (If no matching credentials found, the current URI is marked failed with a 401 response code).<o:p></o:p></span></p> <p><span lang=EN style='mso-ansi-language:EN'>Let it be a given that any rfc2617 credentials found in a <em>persistent domain/virtualdomain object</em> always get loaded into the HTTP GET request.<o:p></o:p></span></p> <p><span lang=EN style='mso-ansi-language:EN'>When our 401'ing URI comes around again for retry, since credentials were loaded the last time this URI was seen, credentials will be found in the <em>persistent domain/virtualdomain object</em> and will be added to the request headers. This time around the authentication should succeed.<o:p></o:p></span></p> <p><span lang=EN style='mso-ansi-language:EN'>Any other URI that is a member of this realm will also subsequently successfully authenticate given the above rule whereby we always load any found credentials into the current request.<o:p> </o:p></span></p> <p><span lang=EN style='mso-ansi-language:EN'>Let the above be the default behavior. 
Configurations would enable/disable:<o:p></o:p></span></p> <div> <p style='margin-left:.5in;text-indent:-.25in;mso-list:l1 level1 lfo2; tab-stops:list .5in'><![if !supportLists]><span lang=EN style='font-size:10.0pt; mso-bidi-font-size:12.0pt;font-family:Symbol;mso-fareast-font-family:Symbol; mso-bidi-font-family:Symbol;mso-ansi-language:EN'><span style='mso-list:Ignore'>< span style='font:7.0pt "Times New Roman"'>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;& nbsp; </span></span></span><![endif]><span lang=EN style='mso-ansi-language:EN'>Enable /Disable this feature.<o:p></o:p></span></p> <p style='margin-left:.5in;text-indent:-.25in;mso-list:l1 level1 lfo2; tab-stops:list .5in'><a name=preemptiveauth></a><![if !supportLists]><span lang=EN style='font-size:10.0pt;mso-bidi-font-size:12.0pt;font-family:Symbol; mso-fareast-font-family:Symbol;mso-bidi-font-family:Symbol;mso-ansi-language: EN'><span style='mso-list:Ignore'><span style='font:7.0pt "Times New Roman"'>&nbs p;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; </span></span></span><![endif]><span lang=EN style='mso-ansi-language:EN'>Pre-po pulation of the <em>persistent domain/virtualdomain object</em> with all rfc2617 credentials upon construction thereby avoiding 401s altogether since we'd be sending all credentials in advance of any challenge (preemptive authentication). A domain might have many rfc2617 realms. Preemptive authentication would have us volunteering all of a domains realms' credentials in each request.<o:p></o:p></span></p> <p style='margin-left:.5in'><span lang=EN style='mso-ansi-language:EN'>The query of the store pre-populating the <em>persistent domain/virtualdomain

object</em> would use the </span><tt><i><span lang=EN style='font-size:10.0pt; mso-ansi-language:EN'>URI canonical root URL</span></i></tt><span lang=EN style='mso-ansi-language:EN'> for a key.<o:p></o:p></span></p> <p style='margin-left:.5in'><span lang=EN style='mso-ansi-language:EN'>This configuration could be set globally for all Heritrix requests or per </span><tt><i><span lang=EN style='font-size:10.0pt;mso-ansi-language:EN'>URI canonical root URL</span></i></tt><span lang=EN style='mso-ansi-language:EN'> by setting a property on the corresponding record in the store.<o:p></o:p></span></p> <p style='margin-left:.5in;text-indent:-.25in;mso-list:l1 level1 lfo2; tab-stops:list .5in'><![if !supportLists]><span lang=EN style='font-size:10.0pt; mso-bidi-font-size:12.0pt;font-family:Symbol;mso-fareast-font-family:Symbol; mso-bidi-font-family:Symbol;mso-ansi-language:EN'><span style='mso-list:Ignore'><span style='font:7.0pt "Times New Roman"'>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; </span></span></span><![endif]><span lang=EN style='mso-ansi-language:EN'>Upon receipt of a 401 and on successfully locating appropriate credentials in the store (or already loaded in the <em>persistent domain/virtualdomain object</em>), configuration could enable immediately retrying the request rather than letting the 401 percolate down through the Heritrix processing chain and back up out of the Frontier (Enabling this configuration would leave no trace of the 401 in the ARC).<o:p></o:p></span></p> </div> <p><span lang=EN style='mso-ansi-language:EN'>The simplest implementation would have us always do <a href="#preemptiveauth" target="_top">preemptive authentication</a>. 
Configuration would turn this feature on or off, and that'd be all.<o:p></o:p></span></p> <p><span lang=EN style='mso-ansi-language:EN'>Below we look in more detail at aspects of the above proposed implementation.<o:p></o:p></span></p> <div> <div> <div> <div> <h4><a name=N10131></a><span lang=EN style='mso-fareast-font-family:"Times New Roman"; mso-ansi-language:EN'>3.1.1.&nbsp;CrawlServer<o:p></o:p></span></h4> </div> </div> </div> <p><span lang=EN style='mso-ansi-language:EN'>In Heritrix, the <em>persistent domain/virtualdomain object</em> is </span><tt><span lang=EN style='font-size: 10.0pt;mso-ansi-language:EN'><a

href="http://crawler.archive.org/xref/org/archive/crawler/datamodel/CrawlServer.html" target="_top">org.archive.crawler.datamodel.CrawlServer</a></span></tt><span lang=EN style='mso-ansi-language:EN'>. It's created in <a href="http://crawler.archive.org/xref/org/archive/crawler/basic/Frontier.html" target="_top">org.archive.crawler.basic.Frontier#next()</a> if no extant CrawlServer is found in the <a href="org.archive.crawler.datamodel.ServerCache" target="_top">org.archive.crawler.datamodel.ServerCache</a>. The lookup is done using a (decoded) <a href="http://java.sun.com/j2se/1.4.2/docs/api/java/net/URI.html" target="_top">URI authority</a>. The currently processed URI has easy access to its corresponding CrawlServer. See <a href="http://crawler.archive.org/xref/org/archive/crawler/datamodel/CrawlURI.html" target="_top">CrawlURI#getServer()</a>.<o:p></o:p></span></p> </div> <div> <div> <div> <div> <h4><a name=N1014E></a><span lang=EN style='mso-fareast-font-family:"Times New Roman"; mso-ansi-language:EN'>3.1.2.&nbsp;HTTPClient<o:p></o:p></span></h4> </div> </div> </div> <p><span lang=EN style='mso-ansi-language:EN'>HTTPClient has built-in support for Basic, Digest and NTLM. It takes care of sending appropriate Authentication headers.<o:p></o:p></span></p> <p><span lang=EN style='mso-ansi-language:EN'>Digest Authentication generally works but has a ways to go according to the comment made on 2004-03-11 16:21 in <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=27594" target="_top"> Wrong reauthentication when using DigestAuthentication</a><o:p></o:p></span></p> <div style='margin-left:.5in;margin-right:.5in'> <h3><span lang=EN style='mso-fareast-font-family:"Times New Roman";mso-ansi-language: EN'>Multiple Realms<o:p></o:p></span></h3> <p><span lang=EN style='mso-ansi-language:EN'>What to do if a host has multiple realms? Will HTTPClient [<a href="#httpclient" title="[httpclient]"><span class=abbrev>httpclient</span></a>] do the right thing and offer all credentials available appropriately? 
Need to test.<o:p></o:p></span></p> </div> <p><span lang=EN style='mso-ansi-language:EN'>The HTTPClient authentication

code was just refactored extensively in HEAD -- post 2.0 release. Reported problems authenticating via a proxy going over SSL.<o:p></o:p></span></p> </div> <div> <div> <div> <div> <h4><a name=N10162></a><span lang=EN style='mso-fareast-font-family:"Times New Roman"; mso-ansi-language:EN'>3.1.3.&nbsp;RFC2617 Record<o:p></o:p></span></h4> </div> </div> </div> <p><span lang=EN style='mso-ansi-language:EN'>An RFC2617 record would be keyed by </span><tt><i><span lang=EN style='font-size:10.0pt;mso-ansi-language:EN'>URI canonical root URL</span></i></tt><span lang=EN style='mso-ansi-language:EN'>. It would contain a realm, login and password. We'd not distinguish proxy (407) records.<o:p></o:p></span></p> </div> </div> <div> <div> <div> <div> <h3><a name=N1016B></a><span lang=EN style='mso-fareast-font-family:"Times New Roman"; mso-ansi-language:EN'>3.2.&nbsp;HTTP POST and GET of Authentication Credentials<o:p></o:p></span></h3> </div> </div> </div> <p><span lang=EN style='mso-ansi-language:EN'>Every URI processed by Heritrix first has preconditions checked. Example preconditions are the fetching of a domain's DNS record and its </span><tt><span lang=EN style='font-size:10.0pt; mso-ansi-language:EN'>robots.txt</span></tt><span lang=EN style='mso-ansi-language: EN'> file before proceeding to make requests against the domain. This proposal is to add a new <em>login precondition</em> after the fashion of the robots and DNS preconditions -- See <a

href="org.archive.crawler.prefetch.PreconditionEnforcer" target="_top">org.archive.crawler.prefetch.PreconditionEnforcer</a> -- and a facility for having our HTTP fetcher run a configurable one-time login.<o:p></o:p></span></p> <p><span lang=EN style='mso-ansi-language:EN'>The new </span><tt><i><span lang=EN style='font-size:10.0pt;mso-ansi-language:EN'>login precondition</span></i></tt><span lang=EN style='mso-ansi-language:EN'> will test the current URI against a preloaded list of <em>login URI patterns</em>. Each </span><tt><i><span lang=EN style='font-size:10.0pt;mso-ansi-language:EN'>login URI pattern </span></i></tt><span lang=EN style='mso-ansi-language:EN'>describes a protected area of a domain (or virtualdomain): e.g. &quot;http://www.archive.org/private/*&quot;. Each </span><tt><i><span lang=EN style='font-size:10.0pt;mso-ansi-language:EN'>login URI pattern</span></i></tt><span lang=EN style='mso-ansi-language:EN'> serves as a key to an associated <em>login record</em>. A </span><tt><i><span lang=EN style='font-size:10.0pt;mso-ansi-language: EN'>login record</span></i></tt><span lang=EN style='mso-ansi-language:EN'> has all information necessary for negotiation of a successful login such as the HTML form content to submit -- username, password, submit button name, etc. -- and whether login requires POSTing or GETting the login form. The login record also has a <em>ran login</em> flag that says whether or not the login has been run previously against this protected area.<o:p></o:p></span></p> <div style='margin-left:.5in;margin-right:.5in'> <h3><span lang=EN style='mso-fareast-font-family:"Times New Roman";mso-ansi-language: EN'>Ran Login flag<o:p></o:p></span></h3> <p><span lang=EN style='mso-ansi-language:EN'>The </span><tt><i><span lang=EN style='font-size:10.0pt;mso-ansi-language:EN'>ran login</span></i></tt><span lang=EN style='mso-ansi-language:EN'> flag says whether the login has been <em>run</em>, not whether or not login <em>succeeded</em>. 
Gauging whether the login was successful or not is difficult. It varies with the login implementation as already noted.<o:p></o:p></span></p> </div> <p><span lang=EN style='mso-ansi-language:EN'>Also part of the login record is a <em>login URI</em>. The </span><tt><i><span lang=EN style='font-size:10.0pt; mso-ansi-language:EN'>login URI</span></i></tt><span lang=EN style='mso-ansi-language: EN'> is the login page whose successful navigation gives access to the protected space: e.g. if the pattern we used for testing was &quot;http://www.archive.org/private/*&quot;, the </span><tt><i><span lang=EN style='font-size:10.0pt;mso-ansi-language:EN'>login URI</span></i></tt><span lang=EN style='mso-ansi-language:EN'> might be &quot;http://www.archive.org/private/login.html&quot;.<o:p></o:p></span></p> <p><span lang=EN style='mso-ansi-language:EN'>If the current URI matches an entry in the </span><tt><i><span lang=EN style='font-size:10.0pt;mso-ansi-language:EN'>login URI pattern</span></i></tt><span lang=EN style='mso-ansi-language:EN'> list, we pull the matched pattern's associated </span><tt><i><span lang=EN style='font-size:10.0pt;mso-ansi-language:EN'>login record</span></i></tt><span

lang=EN style='mso-ansi-language:EN'>. If the </span><tt><i><span lang=EN style='font-size:10.0pt;mso-ansi-language:EN'>ran login</span></i></tt><span lang=EN style='mso-ansi-language:EN'> flag has not been set, the </span><tt><i><span lang=EN style='font-size:10.0pt;mso-ansi-language:EN'>login URI</span></i></tt><span lang=EN style='mso-ansi-language:EN'> is <em>force</em> queued. It's force queued in case the URI has been seen (GET'd) already. The </span><tt><i><span lang=EN style='font-size:10.0pt;mso-ansi-language:EN'>login URI</span></i></tt><span lang=EN style='mso-ansi-language:EN'> (somehow) has the </span><tt><i><span lang=EN style='font-size:10.0pt;mso-ansi-language:EN'>login record</span></i></tt><span lang=EN style='mso-ansi-language:EN'> associated. The presence of the </span><tt><i><span lang=EN style='font-size:10.0pt;mso-ansi-language:EN'>login record</span></i></tt><span lang=EN style='mso-ansi-language:EN'> distinguishes the </span><tt><i><span lang=EN style='font-size:10.0pt;mso-ansi-language:EN'>login URI</span></i></tt><span lang=EN style='mso-ansi-language:EN'>. The current URI is requeued (Precondition not met). Otherwise the current URI is let run through as per normal.<o:p></o:p></span></p> <p><span lang=EN style='mso-ansi-language:EN'>When the </span><tt><i><span lang=EN style='font-size:10.0pt;mso-ansi-language:EN'>login URI</span></i></tt><span lang=EN style='mso-ansi-language:EN'> becomes the current URI and is being processed by the HTTP fetcher, the presence of the </span><tt><i><span lang=EN style='font-size:10.0pt;mso-ansi-language:EN'>login record</span></i></tt><span lang=EN style='mso-ansi-language:EN'> with a </span><tt><i><span lang=EN style='font-size:10.0pt;mso-ansi-language:EN'>ran login</span></i></tt><span lang=EN style='mso-ansi-language:EN'> set to false signals the HTTP fetcher to run the abnormal login sequence rather than do its usual GET. 
The </span><tt><i><span lang=EN style='font-size:10.0pt;mso-ansi-language:EN'>login record</span></i></tt><span lang=EN style='mso-ansi-language:EN'> has all the HTTP fetcher needs to execute the login. Upon completion, the </span><tt><i><span lang=EN style='font-size: 10.0pt;mso-ansi-language:EN'>ran login</span></i></tt><span lang=EN style='mso-ansi-language:EN'> flag is set in the </span><tt><i><span lang=EN style='font-size:10.0pt;mso-ansi-language:EN'>login record</span></i></tt><span lang=EN style='mso-ansi-language:EN'> and the </span><tt><i><span lang=EN style='font-size:10.0pt;mso-ansi-language:EN'>login record</span></i></tt><span lang=EN style='mso-ansi-language:EN'> is removed from the </span><tt><i><span lang=EN style='font-size:10.0pt;mso-ansi-language:EN'>login URI</span></i></tt><span lang=EN style='mso-ansi-language:EN'>.<o:p></o:p></span></p> <div style='margin-left:.5in;margin-right:.5in'> <h3><span lang=EN style='mso-fareast-font-family:"Times New Roman";mso-ansi-language: EN'>GET of the login URI<o:p></o:p></span></h3> <p><span lang=EN style='mso-ansi-language:EN'>What if we haven't already seen the login page? Should the login precondition first force fetch the login URI without the login record loaded so it's first GET'd before we run a login?<o:p></o:p></span></p>

</div> <p><span lang=EN style='mso-ansi-language:EN'>This implementation cannot guarantee successful login nor is there provision for retries. The general notion is that the single running of the login succeeds and that the produced success cookie or rewritten URI makes it back to the Heritrix client gaining us access to the protected area.<o:p></o:p></span></p> <p><span lang=EN style='mso-ansi-language:EN'>Configuration would enable or disable this feature.<o:p></o:p></span></p> <div> <div> <div> <div> <h4><a name=N10202></a><span lang=EN style='mso-fareast-font-family:"Times New Roman"; mso-ansi-language:EN'>3.2.1.&nbsp;Login Record<o:p></o:p></span></h4> </div> </div> </div> <p><span lang=EN style='mso-ansi-language:EN'>A login record would be keyed by the pattern it applies to and would contain the aforementioned </span><tt><i><span lang=EN style='font-size:10.0pt;mso-ansi-language:EN'>ran login</span></i></tt><span lang=EN style='mso-ansi-language:EN'> flag and </span><tt><i><span lang=EN style='font-size:10.0pt;mso-ansi-language:EN'>login URI</span></i></tt><span lang=EN style='mso-ansi-language:EN'>. Tied to the login URI would be a list of key-value pairs to hold the login form content as well as specification of whether the form is to be POSTed or GETed.<o:p></o:p></span></p> </div> </div> <div> <div> <div> <div> <h3><a name=commonage></a><span lang=EN style='mso-fareast-font-family:"Times New Roman"; mso-ansi-language:EN'>3.3.&nbsp;Commonage<o:p></o:p></span></h3> </div> </div> </div>

<p><span lang=EN style='mso-ansi-language:EN'>Here we discuss features common to the two above authentication scheme implementations.<o:p></o:p></span></p> <div> <div> <div> <div> <h4><a name=N10215></a><span lang=EN style='mso-fareast-font-family:"Times New Roman"; mso-ansi-language:EN'>3.3.1.&nbsp;URI#authority as URI canonical root URL<o:p></o:p></span></h4> </div> </div> </div> <p><span lang=EN style='mso-ansi-language:EN'>Proposal is to equate the two. Doing so means no need to change CrawlServer. Currently the CrawlServer is constructed wrapping the URI#authority portion of a URI. URI#authority is </span><tt><i><span lang=EN style='font-size:10.0pt;mso-ansi-language:EN'>URI canonical root URL</span></i></tt><span lang=EN style='mso-ansi-language:EN'> absent the scheme. Assuming CrawlServer is for http only, then it should be safe making this equation.<o:p></o:p></span> </p> <div style='margin-left:.5in;margin-right:.5in'> <h3><span lang=EN style='mso-fareast-font-family:"Times New Roman";mso-ansi-language: EN'>DNS<o:p></o:p></span></h3> <p><span lang=EN style='mso-ansi-language:EN'>Are there CrawlServer instances made for anything but http schemes?<o:p></o:p></span></p> </div> <div style='margin-left:.5in;margin-right:.5in'> <h3><span lang=EN style='mso-fareast-font-family:"Times New Roman";mso-ansi-language: EN'>HTTPS<o:p></o:p></span></h3> <p><span lang=EN style='mso-ansi-language:EN'>Check that </span><tt><i><span lang=EN style='font-size:10.0pt;mso-ansi-language:EN'>URI canonical root URL</span></i></tt><span lang=EN style='mso-ansi-language:EN'>s of </span><tt><span lang=EN style='font-size:10.0pt;mso-ansi-language:EN'>http://www.example.com</span></tt> <span lang=EN style='mso-ansi-language:EN'> and </span><tt><span lang=EN style='font-size:10.0pt;mso-ansi-language:EN'>https://www.example.com</span></tt><span lang=EN style='mso-ansi-language:EN'> result in different </span><tt><span

lang=EN style='font-size:10.0pt;mso-ansi-language:EN'>CrawlServer</span></tt><span lang=EN style='mso-ansi-language:EN'> instances.<o:p></o:p></span></p> </div> </div> <div> <div> <div> <div> <h4><a name=N10237></a><span lang=EN style='mso-fareast-font-family:"Times New Roman"; mso-ansi-language:EN'>3.3.2.&nbsp;Population of Domain/VirtualDomain object with Credentials<o:p></o:p></span></h4> </div> </div> </div> <p><span lang=EN style='mso-ansi-language:EN'>Proposal is that CrawlServer encapsulate credentials store accessing, that it read the store upon construction.<o:p></o:p></span></p> </div> <div> <div> <div> <div> <h4><a name=N1023C></a><span lang=EN style='mso-fareast-font-family:"Times New Roman"; mso-ansi-language:EN'>3.3.3.&nbsp;Caching of Credentials<o:p></o:p></span></h4> </div> </div> </div> <p><span lang=EN style='mso-ansi-language:EN'>Once read from the store, we need to cache the credentials in CrawlServer.<o:p></o:p></span></p> <div> <div> <div>

<div> <h5><a name=N10241></a><span lang=EN style='mso-fareast-font-family:"Times New Roman"; mso-ansi-language:EN'>3.3.3.1.&nbsp;JAAS Subject, Principal and Credentials [<a href="#jaas" title="[jaas]"><span class=abbrev>jaas</span></a>]<o:p></o:p></span></h5> </div> </div> </div> <p><span lang=EN style='mso-ansi-language:EN'>Proposal is that we at least look at selectively exploiting this library for caching credentials. For example, a CrawlServer might implement the javax.security.auth.Subject interface. To this Subject, we'd add implementations of the Principals and Credentials interfaces (Makes sense for the carrying of RFC2617 credentials. Less so for login credentials. TBD).<o:p></o:p></span></p> </div> </div> <div> <div> <div> <div> <h4><a name=store></a><span lang=EN style='mso-fareast-font-family:"Times New Roman"; mso-ansi-language:EN'>3.3.4.&nbsp;Credential Stores<o:p></o:p></span></h4> </div> </div> </div> <p><span lang=EN style='mso-ansi-language:EN'>The credential store would be on disk.<o:p></o:p></span></p> <p><span lang=EN style='mso-ansi-language:EN'>For convenience, particularly listing credentials in a global file store, credentials can be grouped first by host (the base domain -- domain minus port #) and then by URI#authority (domain plus any port #).<o:p></o:p></span></p> <p><span lang=EN style='mso-ansi-language:EN'>Configuration would allow us to point at a global store of credentials.<o:p></o:p></span></p> <div> <div> <div>

<div> <h5><a name=N10252></a><span lang=EN style='mso-fareast-font-family:"Times New Roman"; mso-ansi-language:EN'>3.3.4.1.&nbsp;Layering of Credential Stores<o:p></o:p></span></h5> </div> </div> </div> <p><span lang=EN style='mso-ansi-language:EN'>Subsequently, we'd add support for <em>layering</em> stores. Modeled after Apache's </span><tt><span lang=EN style='font-size:10.0pt;mso-ansi-language:EN'>.htaccess</span></tt><span lang=EN style='mso-ansi-language:EN'> mechanism for selectively overriding the main server configuration on a directory scope, or, closer to home, on how Heritrix settings can be overridden on a per-host basis, it'd be possible to point the store querying code at a directory whose subdirectories are named for domains progressing from a root down through the macro level org, com, gov, etc., subdomains getting progressively more precise: e.g. travel.yahoo.com would be found under the yahoo.com directory which would be under the com directory. Searching for credentials, we'd search up through the directory structure going from the current domain on up to the root, looking for the </span><tt><i><span lang=EN style='font-size:10.0pt;mso-ansi-language:EN'>realm + canonical root URL</span></i></tt><span lang=EN style='mso-ansi-language:EN'> key. If not found in the domain store, or if a domain store did not exist, we'd back up the settings hierarchy until we hit the global store.<o:p></o:p></span></p> </div> <div> <div> <div> <div> <h5><a name=N10262></a><span lang=EN style='mso-fareast-font-family:"Times New Roman"; mso-ansi-language:EN'>3.3.4.2.&nbsp;Exploit the settings framework implementing credentials store<o:p></o:p></span></h5> </div> </div> </div> <p><span lang=EN style='mso-ansi-language:EN'>Propose extending or adapting the Heritrix settings framework to have it manage our credentials store so we can exploit code already written.<o:p></o:p></span></p> </div> </div>

<div> <div> <div> <div> <h4><a name=N10267></a><span lang=EN style='mso-fareast-font-family:"Times New Roman"; mso-ansi-language:EN'>3.3.5.&nbsp;Logging<o:p></o:p></span></h4> </div> </div> </div> <p><span lang=EN style='mso-ansi-language:EN'>A new log will trace authentication transactions. Log will include listing of credentials offered, new cookies, query parameters, and pertinent HTTP headers returned by the submitted authentication, and where possible, report on whether authentication succeeded or not.<o:p></o:p></span></p> </div> <div> <div> <div> <div> <h4><a name=N1026C></a><span lang=EN style='mso-fareast-font-family:"Times New Roman"; mso-ansi-language:EN'>3.3.6.&nbsp;Debugging tool<o:p></o:p></span></h4> </div> </div> </div> <p><span lang=EN style='mso-ansi-language:EN'>A command-line tool to run single logins to aid debugging logins will aid development and be of use to operators.<o:p></o:p></span></p> </div> </div> </div> <div> <div> <div>

<div> <h2><a name=N10271></a><span lang=EN style='mso-fareast-font-family:"Times New Roman"; mso-ansi-language:EN'>4.&nbsp;Design<o:p></o:p></span></h2> </div> </div> </div> <div> <div> <div> <div> <h3><a name=N10274></a><span lang=EN style='mso-fareast-font-family:"Times New Roman"; mso-ansi-language:EN'>4.1.&nbsp;Configuration<o:p></o:p></span></h3> </div> </div> </div> <p><span lang=EN style='mso-ansi-language:EN'>Will add to the HTTP Fetcher options that enable, disable and configure the two authentication types supported.<o:p></o:p></span></p> </div> <div> <div> <div> <div> <h3><a name=N10279></a><span lang=EN style='mso-fareast-font-family:"Times New Roman"; mso-ansi-language:EN'>4.2.&nbsp;Credential store<o:p></o:p></span></h3> </div> </div> </div> <p><span lang=EN style='mso-ansi-language:EN'>Below is a static class model diagram for accessing the credential store.<o:p></o:p></span></p> <div> <p class=MsoNormal><span style='mso-fareast-font-family:"Times New Roman";

mso-no-proof:yes'><img border=0 width=32 height=32 id="_x0000_i1026" src=credentials.gif alt="Static class model diagram for accessing the credential store"></span><span lang=EN style='mso-fareast-font-family:"Times New Roman"; mso-ansi-language:EN'><o:p></o:p></span></p> </div> <div style='margin-left:.5in;margin-right:.5in'> <h3><span lang=EN style='mso-fareast-font-family:"Times New Roman";mso-ansi-language: EN'>Implementation looks nothing like the above<o:p></o:p></span></h3> <p><span lang=EN style='mso-ansi-language:EN'>Ignore the above design. The implementation turned out to be something else altogether. The model was effectively inverted (credentials hold domains) and notions of going via a CredentialManager/CredentialStore to do all operations on the store were removed. While the resultant implementation is not a good OOM, it's amenable to UI manipulation (and sits easily atop the Heritrix settings system).<o:p></o:p></span></p> </div> </div> </div> <div> <div> <div> <div> <h2><a name=N10287></a><span lang=EN style='mso-fareast-font-family:"Times New Roman"; mso-ansi-language:EN'>5.&nbsp;Future<o:p></o:p></span></h2> </div> </div> </div> <p><span lang=EN style='mso-ansi-language:EN'>This section has issues to be addressed later, probably in a version 2.0 of the authentication system.<o:p></o:p></span></p> <div> <div> <div> <div> <h3><a name=N1028C></a><span lang=EN style='mso-fareast-font-family:"Times New Roman"; mso-ansi-language:EN'>5.1.&nbsp;Same URL different Page Content<o:p></o:p></span

></h3> </div> </div> </div> <p><span lang=EN style='mso-ansi-language:EN'>Heritrix distinguishes pages by URIs. Pages seen can be different whether logged in or not. We'll need some way to force/suggest sets of URIs are revisitable after a login token is received. This might mean the 'fingerprint' of a URI includes any authentication information to be used.<o:p></o:p></span></p> </div> <div> <div> <div> <div> <h3><a name=N10291></a><span lang=EN style='mso-fareast-font-family:"Times New Roman"; mso-ansi-language:EN'>5.2.&nbsp;Integration with the UI<o:p></o:p></span></h3> </div> </div> </div> <p><span lang=EN style='mso-ansi-language:EN'>Add/Edit/Delete of Credentials via the UI. Flagging the operator about 401s and likely HTML login forms.<o:p></o:p></span></p> </div> </div> <div id=N10296> <div> <div> <div> <h2><a name=N10296></a><span lang=EN style='mso-fareast-font-family:"Times New Roman"; mso-ansi-language:EN'>Bibliography<o:p></o:p></span></h2> </div> </div> </div>

<div> <p><a name=heritrix></a><span lang=EN style='mso-ansi-language:EN'>[<span class=abbrev>heritrix</span>] <span class=title1><i><a href="http://crawler.archive.org" target="_top">Heritrix is the Internet Archive's open-source, extensible, web-scale, archival-quality web crawler project</a></i>. </span><o:p></o:p></span></p> </div> <div> <p><a name=httpclient></a><span lang=EN style='mso-ansi-language:EN'>[<span class=abbrev>httpclient</span>] <span class=title1><i>Apache Jakarta Commons HTTPClient <a href="http://jakarta.apache.org/commons/httpclient/authentication.html" target="_top">Authentication Guide</a></i>. </span><span class=edition>Commons HTTPClient version 2.0. </span><o:p></o:p></span></p> </div> <div> <p><a name=jaas></a><span lang=EN style='mso-ansi-language:EN'>[<span class=abbrev>jaas</span>] <span class=title1><i><a href="http://java.sun.com/products/jaas/index.jsp" target="_top">Java Authentication and Authorization Service (JAAS)</a></i>. </span><o:p></o:p></span></p> </div> <div> <p><a name=ntlm></a><span lang=EN style='mso-ansi-language:EN'>[<span class=abbrev>ntlm</span>] <span class=title1><i>The <a href="http://davenport.sourceforge.net/ntlm.html" target="_top">NTLM Authentication Protocol</a></i>. </span><o:p></o:p></span></p> </div> <div> <p><a name=rfc2617></a><span lang=EN style='mso-ansi-language:EN'>[rfc2617] <span class=title1><i>RFC2617 <a href="http://ftp.ics.uci.edu/pub/ietf/http/rfc2617.txt" target="_top">HTTP Authentication: Basic and Digest Access Authentication</a></i>. </span><o:p></o:p></span></p> </div> </div> </div> </body> </html>

Вам также может понравиться