--- a/nbbuild/antsrc/org/netbeans/nbbuild/CheckLinks.java Thu Apr 30 00:29:04 2009 +0400 +++ a/nbbuild/antsrc/org/netbeans/nbbuild/CheckLinks.java Thu Apr 30 12:32:42 2009 +0400 @@ -45,13 +45,11 @@ import java.net.*; import java.util.*; import java.util.regex.*; - import org.apache.tools.ant.BuildException; import org.apache.tools.ant.FileScanner; import org.apache.tools.ant.Project; import org.apache.tools.ant.Task; import org.apache.tools.ant.taskdefs.MatchingTask; - import org.apache.tools.ant.types.Mapper; // XXX in Ant 1.6, permit entries to make checking of "external" links @@ -120,6 +118,7 @@ return m; } + @Override public void execute () throws BuildException { if (basedir == null) throw new BuildException ("Must specify the basedir attribute"); FileScanner scanner = getDirectoryScanner (basedir); @@ -155,6 +154,196 @@ private static Pattern hrefOrAnchor = Pattern.compile("<(a|img)(\\s+shape=\"rect\")?\\s+(href|name|src)=\"([^\"#]*)(#[^\"]+)?\"(\\s+shape=\"rect\")?\\s*/?>", Pattern.CASE_INSENSITIVE); private static Pattern lineBreak = Pattern.compile("^", Pattern.MULTILINE); + + /** + * The state of the Check Links process. + */ + protected static class State { + /** + * Ant task to associate with the CheckLinks + */ + protected Task task; + /** + * Global ClassLoader for the NetBeans module under test. + * May be null if it called from + * CheckHelpSets at the development time. + * + * @see CheckHelpSetsBin#globalClassLoadercreateGlobalClassLoader(File dir, String [] files) + */ + protected ClassLoader globalClassLoader; + /** + * ClassLoader map for the NetBeans module under test. + * May be null if it called from + * CheckHelpSets at the development time. + * + * @see CheckHelpSetsBin#createClassLoaderMap (File dir, String [] files) + */ + protected Map classLoaderMap; + /** + * The id string obtained from the javax.help.Map.ID . 
+ */ + protected String referrer; + /** + * The referrer file path (or full URL if not file:) + * It is the empty string for the external calls from both + * CheckHelpSets and CheckHelpSetsBin. + * This is used only in error messages, but not in the bussiness logic. + * @see the local variable basepath in the private method + * CheckLinks.scan(...). + */ + protected String referrerLocation; + /** + * The URI to check + */ + protected URI u; + /** + * The set of URIs known to be fully checked (including all anchored + * variants etc.) + */ + protected Set okurls; + /** + * The set of URIs known to be bogus. + */ + protected Set badurls; + /** + * The set of (base) URIs known to have had their contents checked. + */ + protected Set cleanurls; + /** + * If true, check external links (all protocols besides + * file:) + */ + protected boolean checkexternal; + /** + * If true then it is an error in the case of using a space + * in the URI strings instead of the "%20". + */ + protected boolean checkspaces; + /** + * If true then State.filters will be taken + * into account. + * @see State#filters + */ + protected boolean checkforbidden; + /** + * one of: + *
    + *
  • 0 - just check that it can be opened;
  • + *
  • 1 - check also that any links from it can be opened;
  • + *
  • 2 - recurse
  • + *
+ */ + protected int recurse; + /** + * The list of Mappers to apply to get source files from HTML files. + */ + protected List mappers; + /** + * It seems this filter list is always empty! + */ + protected List filters; + /** + * The list of strings containing error messages that may be displayed + * for a user. + */ + protected List errors; + + /** + * Creates a state of the CheckLinks process. + * + * @param task an Ant task to associate with this + * @param globalClassLoader Global ClassLoader for the NetBeans module + * under test. + * @param classLoaderMap ClassLoader map for the NetBeans module under + * test. + * @param referrer the referrer file path (or full URL if not file:) + * @param referrerLocation the location in the referrer, e.g. ":38:12", + * or "" if unavailable + * @param u the URI to check + * @param okurls a set of URIs known to be fully checked (including all + * anchored variants etc.) + * @param badurls a set of URIs known to be bogus + * @param cleanurls a set of (base) URIs known to have had their + * contents checked + * @param checkexternal if true, check external links (all protocols + * besides file:) + * @param checkspaces If true then it is an error in the + * case of using a space in the URI strings instead of the "%20". + * @param checkforbidden If true then + * State.filters will be taken into account. 
+ * @param recurse one of: + * 0 - just check that it can be opened; + * 1 - check also that any links from it can be opened; + * 2 - recurse + * @param mappers a list of Mappers to apply to get source files from + * HTML files + * @param filters + * @param errors + */ + public State(Task task, ClassLoader globalClassLoader, + Map classLoaderMap, String referrer, + String referrerLocation, URI u, + Set okurls, Set badurls, Set cleanurls, + boolean checkexternal, boolean checkspaces, + boolean checkforbidden, int recurse, + List mappers, List filters, + List errors) { + this.task = task; + this.globalClassLoader = globalClassLoader; + this.classLoaderMap = classLoaderMap; + this.referrer = referrer; + this.referrerLocation = referrerLocation; + this.u = u; + this.okurls = okurls; + this.badurls = badurls; + this.cleanurls = cleanurls; + this.checkexternal = checkexternal; + this.checkspaces = checkspaces; + this.checkforbidden = checkforbidden; + this.recurse = recurse; + this.mappers = mappers; + this.filters = filters; + this.errors = errors; + } + + private State(State state) { + this.task = state.task; + this.globalClassLoader = state.globalClassLoader; + this.classLoaderMap = state.classLoaderMap; + this.referrer = state.referrer; + this.referrerLocation = state.referrerLocation; + this.u = state.u; + this.okurls = state.okurls; + this.badurls = state.badurls; + this.cleanurls = state.cleanurls; + this.checkexternal = state.checkexternal; + this.checkspaces = state.checkspaces; + this.checkforbidden = state.checkforbidden; + this.recurse = state.recurse; + this.mappers = state.mappers; + this.filters = state.filters; + this.errors = state.errors; + } + + @Override + public State clone() { + return new State(this); + } + + } // State + + /** + * Scan for broken links. + * @param s a state of the CheckLinks process. 
+ * @throws java.io.IOException + */ + public static void scan(CheckLinks.State s) throws IOException { + // System.out.println("CheckLinks.scan u: " + s.u); + scan(s.task, s.globalClassLoader, s.classLoaderMap, s.referrer, + s.referrerLocation, s.u, s.okurls, s.badurls, s.cleanurls, + s.checkexternal, s.checkspaces, s.checkforbidden, s.recurse, + s.mappers, s.filters, s.errors); + } /** * Scan for broken links. @@ -179,7 +368,9 @@ boolean checkexternal, boolean checkspaces, boolean checkforbidden, int recurse, List mappers, List errors) throws IOException { scan (task, globalClassLoader, classLoaderMap, - referrer, referrerLocation, u, okurls, badurls, cleanurls, checkexternal, checkspaces, checkforbidden, recurse, mappers, Collections.emptyList(), errors); + referrer, referrerLocation, u, okurls, badurls, cleanurls, + checkexternal, checkspaces, checkforbidden, recurse, mappers, + Collections.emptyList(), errors); } private static void scan @@ -545,6 +736,14 @@ } // else we are only checking that this one has right anchors } } + // Process HTML Object Elements + CheckLinks.State state = + new State(task, globalClassLoader, classLoaderMap, basepath, + referrerLocation, u, okurls, badurls, cleanurls, + checkexternal, checkspaces, checkforbidden, recurse, + mappers, filters, errors); + HTMLObjectElementsChecker coe = new HTMLObjectElementsChecker(state); + coe.check(content); } else { task.log("Not checking contents of " + base, Project.MSG_VERBOSE); } --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ 0d6086c24c93 Thu Apr 30 12:32:42 2009 +0400 @@ -0,0 +1,867 @@ +/* + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER. + * + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * + * The contents of this file are subject to the terms of either the GNU + * General Public License Version 2 only ("GPL") or the Common + * Development and Distribution License("CDDL") (collectively, the + * "License"). 
You may not use this file except in compliance with the + * License. You can obtain a copy of the License at + * http://www.netbeans.org/cddl-gplv2.html + * or nbbuild/licenses/CDDL-GPL-2-CP. See the License for the + * specific language governing permissions and limitations under the + * License. When distributing the software, include this License Header + * Notice in each file and include the License file at + * nbbuild/licenses/CDDL-GPL-2-CP. Sun designates this + * particular file as subject to the "Classpath" exception as provided + * by Sun in the GPL Version 2 section of the License file that + * accompanied this code. If applicable, add the following below the + * License Header, with the fields enclosed by brackets [] replaced by + * your own identifying information: + * "Portions Copyrighted [year] [name of copyright owner]" + * + * If you wish your version of this file to be governed by only the CDDL + * or only the GPL Version 2, indicate your decision by adding + * "[Contributor] elects to include this software in this distribution + * under the [CDDL or GPL Version 2] license." If you do not indicate a + * single choice of license, a recipient has the option to distribute + * your version of this file under either the CDDL, the GPL Version 2 or + * to extend the choice of license to its licensees as provided above. + * However, if you add GPL Version 2 code and therefore, elected the GPL + * Version 2 license, then the option applies only if the new code is + * made subject to such option by the copyright holder. + * + * Contributor(s): + * + * Portions Copyrighted 2009 Sun Microsystems, Inc. 
+ */ + +package org.netbeans.nbbuild; + +import java.io.IOException; +import java.net.URI; +import java.net.URISyntaxException; +import java.text.MessageFormat; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import org.apache.tools.ant.Project; + +/** + * Checker for the <object> elements. + * The <object> elements may be used by the HelpSet authors in the HTML + * sources. + * By default the checker is aware about the following classes of the + * <object> elements: + *
    + *
  • java:org.netbeans.modules.javahelp.BrowserDisplayer + * Testable URI value is contained in a <param> element with the + * name content. + *
  • + *
  • java:com.sun.java.help.impl.JHSecondaryViewer + * Testable URI value is contained in a <param> element with the + * name content. + *
  • + *
+ * + * @version 1.0 + * + * @see + * Issue #117506 + * @see + * JavaHelp - Help Set Checkers + * + * @author Victor G. Vasilyev + * + * TODO: + * - TBD It will be better to move all the help set checkers to the separate + * package org.netbeans.nbbuild.helpsets + * - Getting a name list of the <object> properties as an input to provide + * checking of their values for correctness of URLs. + * The list should be passed via Ant property. + * - Getting a <object> classid list as an input to provide + * exclusion of this object from checking. Take the list into account in the + * method check(String content). The list should be passed via Ant property. + */ +public class HTMLObjectElementsChecker { + + /** + * The separator used in the identifiers of the <param> elements + * to separate classid from a name of the <param> + * element. + * + * @see #getParamID(java.lang.String, java.lang.String) + */ + public static final String PARAM_NAME_SEPARATOR = "."; + + private static final String ERROR_PREFIX = "HTML element error: "; + + /** + * The pattern for the message: + * "The element without classid attribute." + * where: + *
    + *
  • {0} - a URI string.
  • + *
  • {1} - a string value of the offset of the buggy text in the HTML + * file.
  • + *
  • {2} - a buggy text.
  • + *
+ */ + private static final String ERROR1 = + "The element without classid attribute. \n" + + "file:\n" + + "{0}\n" + + "offset: {1}\n" + + "text: \n" + + "{2}"; + + /** + * The pattern for the message: + * "The element with unknown classid:" + * where: + *
    + *
  • {0} - a string value of the classid attribute.
  • + *
  • {1} - a URI string.
  • + *
  • {2} - a string value of the offset of the buggy text in the HTML + * file.
  • + *
  • {3} - a buggy text.
  • + *
+ */ + private static final String ERROR2 = + "The element with unknown classid: \n" + + "{0}\n" + + "file:\n" + + "{1}\n" + + "offset: {2}\n" + + "text: \n" + + "{3}"; + + /** + * The pattern for the message: + * "The <param> element with unknown name." + * where: + *
    + *
  • {0} - a text of <param> element.
  • + *
  • {1} - a URI string.
  • + *
  • {2} - a string value of the offset of the buggy text in the HTML + * file.
  • + *
  • {3} - a buggy text.
  • + *
+ */ + private static final String ERROR3 = + "The element with unknown name. \n" + + "{0}\n" + + "file:\n" + + "{1}\n" + + "offset: {2}\n" + + "text: \n" + + "{3}"; + + /** + * The pattern for the message: + * "The <param> element with unknown value." + * where: + *
    + *
  • {0} - a text of <param> element.
  • + *
  • {1} - a URI string.
  • + *
  • {2} - a string value of the offset of the buggy text in the HTML + * file.
  • + *
  • {3} - a buggy text.
  • + *
+ */ + private static final String ERROR4 = + "The element with unknown value. \n" + + "{0}\n" + + "file:\n" + + "{1}\n" + + "offset: {2}\n" + + "text: \n" + + "{3}"; + + /** + * The pattern for the message: + * "The <param> element with unknown value." + * where: + *
    + *
  • {0} - a text of <param> element.
  • + *
  • {1} - a URI string.
  • + *
  • {2} - a string value of the offset of the buggy text in the HTML + * file.
  • + *
  • {3} - a buggy text.
  • + *
  • {4} - a URISyntaxException text
  • + *
+ */ + private static final String ERROR5 = + "The element has incorrect URI value. \n" + + "exception: {4}\n" + + "{0}\n" + + "file:\n" + + "{1}\n" + + "offset: {2}\n" + + "text: \n" + + "{3}\n"; + + private CheckLinks.State state; + + private List knownClassIDs = new ArrayList(); + + private List knownURIParams = new ArrayList(); + { + addTestableURIParam( + "java:org.netbeans.modules.javahelp.BrowserDisplayer.content"); + addTestableURIParam( + "java:com.sun.java.help.impl.JHSecondaryViewer.content"); + } + + /** + * Constructs HTMLObjectElementsChecker associated with the + * specified CheckLinks process. + * @param task - The state of the CheckLinks process. + */ + public HTMLObjectElementsChecker(CheckLinks.State state) { + this.state = state; + } + + /** + * Excludes class of the <object> elements from testing by registering + * the specifyed classid as a known classid for this checker. + * @param classid - a value of the classid attribute + * used in the <object> elements of that class. + */ + public void excludeObjectClass(String classid) { + knownClassIDs.add(classid); + } + + /** + * Registers the specified paramid as a URI Param that will be + * tested. + *
+ *

+     * paramid := classid PARAM_NAME_SEPARATOR paramName
+     * paramName is a value of the name attribute of the param element nested to
+     * the object element with the classid attribute whose value is used in the
+     * paramid :-)
+     * 
+ * @param paramid - The id of the PARAM element associated with the classid + * of the outer OBJECT element. + */ + public void addTestableURIParam(String paramid) { + knownURIParams.add(paramid); + int dotPos = paramid.lastIndexOf(PARAM_NAME_SEPARATOR); + String classid = paramid.substring(0, dotPos); + knownClassIDs.add(classid); + } + + /** + * Checks the specified HTML content. + * The HTML content may contain the <object> elements + * that will be checked. + * + * @param content - The HTML content under test. + */ + public void check(String content) { + URI uri = state.u; + antLog("Check HTML elements in " + uri, Project.MSG_VERBOSE); + HTML html = new HTML(content); + // process OBJECT elements: + Iterable oi = html.getObjects(); + for(HTML.Object e : oi) { + HTML.Attribute a = e.getAttribute("classid"); + if(a == null) { + error(ERROR1, uri, e.getLineColumn(), e.getText()); + continue; + } + String classid=a.getValue(); + if(!knownClassIDs.contains(classid)) { + error(ERROR2, a.getValue(), uri, e.getLineColumn(), + e.getText()); + continue; + } + // TODO: May be check the known against its XML Schema, + // but when the HTML text should be normalized before. + + // process nested PARAM elements + Iterable pi = e.getParams(); + for(HTML.Object.Param p : pi) { + // process attributes of the PARAM element:; + HTML.Attribute paName = p.getAttribute("name"); + if(paName == null) { + error(ERROR3, p.getText(), uri, e.getLineColumn(), + e.getText()); + continue; + } + String paNameValue = paName.getValue(); + String paramid = getParamID(classid, paNameValue); + if(!knownURIParams.contains(paramid)) { + continue; + } + // check URI defined in the param element. 
+ antLog("Check HTML element with URI type: " + paramid, + Project.MSG_VERBOSE); + + HTML.Attribute paValue = p.getAttribute("value"); + if(paValue == null) { + error(ERROR4, p.getText(), uri, e.getLineColumn(), + e.getText()); + continue; + } + String testableURI = paValue.getValue(); + CheckLinks.State s = state.clone(); + s.recurse = 0; + s.referrer = testableURI; + try { + s.u = new URI(testableURI); + } catch (URISyntaxException ex) { + error(ERROR5, p.getText(), uri, e.getLineColumn(), + e.getText(), ex.toString()); + } + try { + CheckLinks.scan(s); + } catch (IOException ex) { + ex.printStackTrace(); + } + } // for(HTML.Object.Param p : pi) + } // for(HTML.Object e : oi) + + } + + private String getParamID(String classid, String paramName) { + return classid + PARAM_NAME_SEPARATOR + paramName; + } + + /** + * Shows Ant log message. + * @param s - The message string. + * @param level - The level of the message, e.g Project.MSG_VERBOSE + * + * @see org.apache.tools.ant.Project + */ + private void antLog(String s, int level) { + state.task.log(s, level); + } + + /** + * Adds the specified error message to the error list associated with the + * CheckLinks.State. An associated Ant task will show these + * messages at the completion. The ERROR_PREFIX will be added + * before each message. + * + * @param s - The error message. + */ + private void error(String s) { + state.errors.add(ERROR_PREFIX + s); + } + + /** + * Adds the specified error message to the error list associated with the + * CheckLinks.State. An associated Ant task will show these + * messages at the completion. + *

The error message is built by formatting the given arguments + * with the given pattern.

+ *

The ERROR_PREFIX will be added before each message.

+ * + * @param pattern - The pattern of the message. + * @param arguments - The arguments of the message. + */ + private void error(String pattern, Object... arguments) { + error(MessageFormat.format(pattern, arguments)); + } + + + /** + * HTML parser that helps to find <object> elements, nested + * <param> elements and their attributes in the specified HTML + * source. + *
+ * Usage: + *

+     * HTML html = new HTML(htmlText);
+     * // process OBJECT elements:
+     * Iterable oi = html.getObjects();
+     * for(HTML.Object e : oi) {
+     *     int start=e.getStart(); // start position of the element
+     *     int end=e.getEnd(); // end position of the element
+     *     String text = e.getText(); // text of the element
+     *     // ...
+     *     // process attributes of the OBJECT element:;
+     *     Iterable ai = e.getAttributes();
+     *     for(HTML.Attribute a : ai) {
+     *         String name=a.getName(); // name of the attribute
+     *         String value=a.getValue(); // value of the attribute
+     *         // ...
+     *     }
+     *     // process nested PARAM elements
+     *     Iterable pi = e.getParams();
+     *     for(HTML.Object.Param p : pi) {
+     *         String ptext = p.getText(); // text of the element
+     *         // process attributes of the PARAM element:;
+     *         Iterable pai = p.getAttributes();
+     *         for(HTML.Attribute pa : pai) {
+     *             String name=pa.getName(); // name of the attribute
+     *             String value=pa.getValue(); // value of the attribute
+     *             // ...
+     *         }
+     *     }
+     * }
+     * 
+ */ + public static class HTML { + + private static Pattern lineBreak = + Pattern.compile("^", Pattern.MULTILINE); + + /** + * SGML source where tokens are valid comment delimiters. + */ + protected String source; + + /** + * + * @param source + */ + public HTML(String source) { + this.source = source; + } + + /** + * Returns an instance of Iterable object for the + * <object> elements contained in the HTML source. + * @return an Iterable object. + */ + public Iterable getObjects() { + @SuppressWarnings("unchecked") // the cast is correct + final Iterator ei = new HTML.Object(this); + return new Iterable() { + public Iterator iterator() { + return ei; + } + }; + } + + + /** + * Abstract HTML construction. + */ + public abstract class Construction implements Iterator { + + /** + * Match flags, a bit mask that may include + * Pattern.CASE_INSENSITIVE, Pattern.MULTILINE, Pattern.DOTALL, + * Pattern.UNICODE_CASE, and Pattern.CANON_EQ + */ + private int matchFlags; + + private String regex; + + private Matcher m; + + /** + * Creates instance of the HTML + * @param regex + * @param matchFlags + */ + protected Construction(String regex, int matchFlags) { + this.regex = regex; + this.matchFlags = matchFlags; + Pattern p = getPattern(); + m = p.matcher(source); + } + + /** + * Returns Matcher associated with the Construction. + * @return Matcher for the Construction. + */ + protected Matcher getMatcher() { + return m; + } + + /** + * Returns Pattern associated with the Construction. + * @return Pattern for the Construction. + */ + public Pattern getPattern() { + return Pattern.compile(regex, matchFlags); + } + + /** + * Returns text of the Construction. + * @return text of the Construction. + */ + public String getText() { + return getMatcher().group(); + } + + /** + * Checks whether this HTML.Element is not commented + * out. + * @return false if comment start is before element, but end is not. 
+ */ + protected boolean isCommentedOut() { + int pos = getMatcher().start(); + int commentStart = source.lastIndexOf (Comment.START, pos); + int commentEnd = source.lastIndexOf (Comment.END, pos); + if(commentStart == -1) { // i.e. ... elem ... + return false; + } + if(commentEnd == -1) { + return true; // i.e. ... ... elem ... + } + + /** + * Returns the start index of the Construction. + * + * @return The index of the first character of the + * Construction. + * @throws IllegalStateException - If no match has yet + * been attempted, or if the previous match operation failed. + */ + public int getStart() { + return getMatcher().start(); + } + + /** + * Returns the offset after the last character of the + * Construction. + * + * @return The offset after the last character of the + * Construction. + * @throws IllegalStateException - If no match has yet + * been attempted, or if the previous match operation failed. + */ + public int getEnd() { + return getMatcher().end(); + } + + /** + * Return the start offset of the Construction as a + * string in form: Line:Column. + * @return a line:col string. + */ + public String getLineColumn() { + Matcher lbm = lineBreak.matcher(source); + int line = 0; + int col = 1; + int pos = getStart(); + while (lbm.find()) { + if (lbm.start() <= pos) { + line++; + col = pos - lbm.start() + 1; + } else { + break; + } + } + return line + ":" + col; + } + + public boolean hasNext() { + while(getMatcher().find()) { + if(!isCommentedOut()) { + return true; + } + } + return false; + } + + public Construction next() { + return this; + } + + public void remove() { + throw new UnsupportedOperationException(); + } + + } // Element + + /** + * Commented block. + */ + public class Comment { + /** + * Start token of the HTML commented block. + */ + public static final String START = ""; // NOI18N + } + + /** + * HTML Construction with attributes. 
+ */ + public class AttributableConstruction extends Construction { + + private int attListGroup; + + /** + * Creates AttributableConstruction. + * @param regex - The regular expression associated with the + * AttributableConstruction. + * @param matchFlags + * @param attListGroup + */ + public AttributableConstruction(String regex, + int matchFlags, int attListGroup) { + super(regex, matchFlags); + this.attListGroup = attListGroup; + } + + /** + * Returns an Iterable object for the set of attributes + * associated whit the HTML element. + * + * @return an Iterable object. + */ + public Iterable getAttributes() { + int startAttList = getMatcher().start(attListGroup); + int endAttList = getMatcher().end(attListGroup); + @SuppressWarnings("unchecked") // the cast is correct + final Iterator i = new HTML.Attribute(startAttList, + endAttList); + return new Iterable() { + public Iterator iterator() { + return i; + } + }; + } + + /** + * Returns an object attribute with the specified name. + * @param name - The name of the object attribute. + * @return The object attribute if exists, otherwise + * null. + */ + public HTML.Attribute getAttribute(String name) { + Iterable ai = getAttributes(); + for(HTML.Attribute a : ai) { + if(name.equalsIgnoreCase(a.getName())) { + return a; + } + } + return null; + } + } // AttributableConstruction + + /** + * The HTML object element. + */ + public class Object extends AttributableConstruction { + + /** + * Regular expression for the attribute list group. + * @see http://www.w3.org/TR/html4/intro/sgmltut.html#h-3.2.2 + */ + private static final String ATTLIST = "(.*?)"; // NOI18N + private static final String START = ""; // NOI18N + private static final String BODY = "(.*?)"; // NOI18N + private static final String END = "
"; // NOI18N + private static final String REGEX = START + BODY + END; + private static final int ATTLIST_GROUP = 1; + private static final int PARAMS_GROUP = 2; + private static final int MATCH_FLAGS = Pattern.CASE_INSENSITIVE| + Pattern.DOTALL|Pattern.MULTILINE; + + /** + * + * @param html + */ + public Object(HTML html) { + super(REGEX, MATCH_FLAGS, ATTLIST_GROUP); + } + + /** + * + * @return + */ + public Iterable getParams() { + int startParams = getMatcher().start(PARAMS_GROUP); + int endParams = getMatcher().end(PARAMS_GROUP); + @SuppressWarnings("unchecked") // the cast is correct + final Iterator i = new Param(startParams, endParams); + return new Iterable() { + public Iterator iterator() { + return i; + } + }; + } + + /** + * + */ + public class Param extends AttributableConstruction { + private static final String REGEX = ""; // NOI18N + private static final int ATTLIST_GROUP = 1; + private static final int MATCH_FLAGS = Pattern.CASE_INSENSITIVE| + Pattern.DOTALL|Pattern.MULTILINE; + + private Matcher paramMatcher; + + /** + * + * @param start + * @param end + */ + public Param(int start, int end) { + super(REGEX, MATCH_FLAGS, ATTLIST_GROUP); + Matcher parentMatcher = super.getMatcher(); + paramMatcher = parentMatcher.region(start, end); + } + + /** + * + * @return + */ + @Override + public Matcher getMatcher() { + return paramMatcher; + } + + } // Param + + } // Object + + /** + * Attribute of the element. + * This implementation has the following limitations: + *
    + *
  • Only default case of the attribute syntax specified by the HTML + * 4.01 Specification is supported:
    + * "By default, SGML requires that all attribute values be + * delimited using either double quotation marks (ASCII decimal + * 34) or single quotation marks (ASCII decimal 39).
    + * ...
    + * We recommend using quotation marks even when it is possible + * to eliminate them."
  • + *
  • NOT SUPPORTED: "Single quote marks can be included within the + * attribute value when the value is delimited by double quote + * marks, and vice versa. Authors may also use numeric character + * references to represent double quotes (") and single quotes + * ('). For double quotes authors can also use the character + * entity reference &quot;
  • + *
  • NOT SUPPORTED: In certain cases, authors may specify the value + * of an attribute without any quotation marks. The attribute value + * may only contain letters (a-z and A-Z), digits (0-9), hyphens + * (ASCII decimal 45), periods (ASCII decimal 46), underscores + * (ASCII decimal 95), and colons (ASCII decimal 58).
  • + *
+ * + * @see + * HTML 4.01 Specification - 3.2.2 Attributes + */ + public class Attribute extends Construction { + private static final String NAME = "(\\w*?)"; + private static final String SD = "(\"|')"; // Start delimiter + private static final String VALUE = "(.*?)"; + private static final String ED = "\\2"; // reffers to SD group + private static final String REGEX = + NAME + "=\\s*?" + SD + VALUE + ED; // NOI18N + private static final int NAME_GROUP = 1; + private static final int VALUE_GROUP = 3; + private static final int MATCH_FLAGS = + Pattern.CASE_INSENSITIVE|Pattern.DOTALL; // Not multiline! + + private Matcher attMatcher; + + /** + * + * @param start + * @param end + */ + public Attribute(int start, int end) { + super(REGEX, MATCH_FLAGS); + Matcher parentMatcher = super.getMatcher(); + attMatcher = parentMatcher.region(start, end); + } + + /** + * + * @return + */ + @Override + public Matcher getMatcher() { + return attMatcher; + } + + /** + * + * @return + */ + public String getName() { + return getMatcher().group(NAME_GROUP); + } + + /** + * + * @return + */ + public String getValue() { + return getMatcher().group(VALUE_GROUP); + } + } // Attribute + + } // HTML + + /** + * Development time test entry point. 
+ * @param args the command line arguments + */ + public static void main(String[] args) { + +// String testHTML = " \n " + +// "\t \n" + +// "\t\t \n" + +// "\t\t\n" + +// "\t\t\n" + +// "\t\t\n" + +// "\t\t\n" + +// "\t\t\n" + +// "\t\t\n" + +// "\t\n" + +// "\n "+ +// "\t \n" + +// "\t\t \n" + +// "\t\t\n" + +// "\t\t\n" + +// "\t\t\n" + +// "\t\t\n" + +// "\t\t\n" + +// "\t\t\n" + +// "\t\n" + +// ""; +// HTML html = new HTML(testHTML); +// +// Iterable oi = html.getObjects(); +// for(HTML.Object e : oi) { +// System.out.println("start="+e.getStart()); +// System.out.println("end="+e.getEnd()); +// System.out.println(e.getText()); +// System.out.println("Attributes:"); +// Iterable ai = e.getAttributes(); +// for(HTML.Attribute a : ai) { +// System.out.println(a.getText()); +// System.out.println("name=["+ a.getName() + "] value=[" + a.getValue() + "]" ); +// } +// System.out.println("Parameters:"); +// Iterable pi = e.getParams(); +// for(HTML.Object.Param p : pi) { +// System.out.println(p.getText()); +// // Attributes of the PARAM element:; +// Iterable pai = p.getAttributes(); +// for(HTML.Attribute pa : pai) { +// String name=pa.getName(); // name of the attribute +// String value=pa.getValue(); // value of the attribute +// System.out.println("name=["+ name + "] value=[" + value + "]" ); +// // ... +// } +// } +// } + } +}