Snippets

Elvis Morales: Find all URLs on a webpage and use StorageResource to print summary statistics about them.

Created by Elvis Morales last modified
/**
 * Find all URLs on a webpage and use StorageResource to print summary statistics about them.
 * 
 * @author (Elvis Morales) 
 * @version (1.0)
 */

import edu.duke.*;

/**
 * Collects the {@code href} values of a web page that start with {@code "http"}
 * and prints simple statistics about them.
 */
public class FindAllURLs {

    /** Attribute prefix that introduces a link target in the page text. */
    private static final String HREF_MARKER = "href=";

    /** Page scanned by {@link #testURLWithStorage()}. */
    private static final String TEST_PAGE =
        "http://www.dukelearntoprogram.com/course2/data/manylinks.html";

    /**
     * Scans the text of the page at {@code url} for {@code href=} attributes and
     * collects every quoted value that starts with {@code "http"}.
     *
     * <p>Both double-quoted and single-quoted attribute values are recognised.
     * An {@code href=} that is not immediately followed by a quote character, or
     * whose opening quote is never closed, is skipped rather than mis-parsed.
     *
     * @param url address of the page to scan
     * @return the matching {@code href} values, in the order they appear in the page text
     */
    public StorageResource findURLs( String url ) {

        String pageContStr = new URLResource( url ).asString();
        StorageResource urlsFound = new StorageResource();

        int start = 0;

        while (true) {

            int urlStart = pageContStr.indexOf( HREF_MARKER, start );
            if ( urlStart == -1 ) {
                break;
            }

            // Position where the opening quote of the attribute value is expected.
            int quotePos = urlStart + HREF_MARKER.length();
            if ( quotePos >= pageContStr.length() ) {
                break;
            }

            char quote = pageContStr.charAt( quotePos );
            if ( quote != '"' && quote != '\'' ) {
                // Unquoted value: there is no reliable end delimiter, so skip it
                // instead of swallowing text up to some unrelated quote.
                start = quotePos;
                continue;
            }

            // The value ends at the next occurrence of the SAME quote character.
            int urlStop = pageContStr.indexOf( quote, quotePos + 1 );
            if ( urlStop == -1 ) {
                // Opening quote never closed; later attributes may still use
                // the other quote style, so keep scanning.
                start = quotePos + 1;
                continue;
            }

            String hrefVal = pageContStr.substring( quotePos + 1, urlStop );
            if ( hrefVal.startsWith("http") ) {
                urlsFound.add( hrefVal );
            }

            start = urlStop + 1;
        }

        return urlsFound;
    }

    /**
     * Prints every link found on the test page, followed by summary counts:
     * total links, links starting with {@code "https"}, links containing
     * {@code ".com"}, links ending in {@code ".com"} or {@code ".com/"}, and
     * the total number of {@code '.'} characters across all links.
     */
    public void testURLWithStorage() {

        StorageResource urls = findURLs( TEST_PAGE );
        int secLinks = 0;
        int dotComLinks = 0;
        int dotComEnd = 0;
        int dotsQty = 0;

        for ( String url : urls.data() ) {

            System.out.println( url );

            if ( url.startsWith("https") ) {
                secLinks += 1;
            }

            if ( url.contains(".com") ) {
                dotComLinks += 1;
            }

            if ( url.endsWith(".com") || url.endsWith(".com/") ) {
                dotComEnd += 1;
            }

            dotsQty += countDots( url );
        }

        System.out.println( "URLs found: "+urls.size() );
        System.out.println( "Secure links found: "+secLinks );
        System.out.println( "Dot com links: "+dotComLinks );
        System.out.println( "Ends with dot com: "+dotComEnd );
        System.out.println( "Number of dots: "+dotsQty );

    }

    /** Counts the {@code '.'} characters in {@code text}. */
    private static int countDots( String text ) {
        int count = 0;
        int pos = text.indexOf( '.' );
        while ( pos != -1 ) {
            count += 1;
            pos = text.indexOf( '.', pos + 1 );
        }
        return count;
    }

}

Comments (0)

HTTPS SSH

You can clone a snippet to your computer for local editing. Learn more.