This is true and I apologize. java.net.URL seems to behave slightly differently than we had assumed at first. Attached is a corrected sample output created with the same command line. Note that pages with frames, like http://www.cs.cmu.edu/afs/cs/project/concert/www and http://www.cmu.edu/blackboard/, don't have any hyperlinks extracted from them. Correctly handling frames would require a change to the FSM and is not necessary for completing HW5 for full credit. Our FSM could be made arbitrarily complex to handle more and more cases, but we need to set a cutoff somewhere so that the class can finish the assignment by the deadline.
Also, anyone using the provided PageLexer.java should make the following correction. Simply replace the Action inner class of PageLexer.java with the following code so that relative URLs will be handled correctly:
private class Action {
// Remove trailing '/' characters from a string String fixer1(String s) { while((s.length() > 0) && (s.charAt(s.length()-1)=='/')) s = s.substring(0,s.length()-1); return s; }
// Remove everything after the last '/' character unless the last // characters are '/'s String fixer2(String s) { s = fixer1(s); while((s.length() > 0) && (s.charAt(s.length()-1)!='/')) s = s.substring(0,s.length()-1); return fixer1(s); }
void doit (int state) { switch (state) { case 0: break; // In state 1 we have parsed a keyword, so add it. case 1: elts.add(new PageWord(tokenStream.sval)); break; case 2: break; // In state 3 we have parsed a number, so add it. case 3: elts.add(new PageNum(tokenStream.nval)); break; case 4: break; case 5: break; case 6: break; case 7: break; case 8: break; case 9: // In state 9 we have parsed a hyperlink, so check if it // is OK, and if it is, then add it. try { // See if it is a valid URL elts.add(new PageHref(tokenStream.sval)); } catch (MalformedURLException e) { try { // See if it is an a URL relative to this context or this page if(tokenStream.sval.charAt(0)=='/') elts.add(new PageHref(url, tokenStream.sval)); else { URL toTry = new URL(fixer1(url.toString()) + "/" + tokenStream.sval); try { URLTextReader ut = new URLTextReader(toTry); HttpTokenizer ht = new HttpTokenizer(ut); if(ht.nextToken() != HttpTokenizer.HT_EOF) elts.add(new PageHref(toTry.toString())); else elts.add(new PageHref(fixer2(url.toString()) + "/" + tokenStream.sval)); } catch (IOException e1) { elts.add(new PageHref(fixer2(url.toString()) + "/" + tokenStream.sval)); } } } catch (MalformedURLException e2) { ; } } } } } |