#!/usr/bin/perl

# Perl script to retrieve the attachments of a mikrocontroller.net thread that was saved
# with Firefox (mode: whole page).
# by Michael Gross

# call syntax: script.pl < input.file > output.file

use warnings;

our %found_links;
our @file;

# Split an attachment URL into its numeric id and its file name.
#
# Parameter: the URL (or path) to inspect, e.g. ".../attachment/12345/file.zip".
# Returns:   the list (id, filename) when the argument contains
#            "attachment/<digits>/<name>", otherwise the empty list.
#
# The argument is kept in a lexical instead of being assigned to $_, so the
# caller's $_ (for example the current input line) is left untouched.
sub get_id_and_filename($) {
  my ($url) = @_;
  my @result;

  # The leading greedy .* makes the match prefer the last
  # "attachment/<id>/" occurrence in the string.
  if ($url =~ /.*attachment\/([0-9]+)\/(.+)/) {
    @result = ($1, $2);
  }

  return @result;
}

# Read the HTML file from stdin, remember every attachment link in
# %found_links, and rewrite each such link to its local copy below
# ./attachments/. The (possibly modified) lines are collected in @file.
while (my $input = <>) {
  my $line = $input;

  # [^"]* keeps each capture inside a single href attribute (a greedy .* could
  # run across several anchors on one line); the /g match in list context
  # yields every attachment link on the line, not just one.
  for my $link ($input =~ /<a href="([^"]*attachment[^"]*)"/g) {
    $found_links{$link} = 1;

    # replace the link with the local version
    my @id = get_id_and_filename($link);
    if (scalar @id == 2) {
      # \Q...\E: the link is literal text, not a pattern -- URLs routinely
      # contain regex metacharacters such as ".", "?" and "+".
      $line =~ s/\Q$link\E/.\/attachments\/$id[0]\/$id[1]/;
    }
  }

  push @file, $line;
}

# Report on stderr how many distinct attachment links were collected.
my $attachment_count = scalar keys %found_links;
warn "found a total of ".$attachment_count." attachments...\n";

# Create the download directory unless something with that name already exists;
# failing to create it is fatal.
unless (-e "./attachments") {
  mkdir "./attachments" or die "cannot create directory: $!\n";
}

# Download every collected attachment with wget into
# ./attachments/<id>/<filename>.
foreach my $url (keys %found_links) {
  # extract the id and the file name; links that do not parse are skipped
  # instead of producing paths built from undefined values
  my @id = get_id_and_filename($url);
  if (scalar @id != 2) {
    warn "skipping $url: cannot determine id and file name\n";
    next;
  }
  my ($id, $filename) = @id;

  # The links come from a downloaded page, so do not trust them: a "/" in the
  # file name would let the target escape ./attachments/<id>/.
  if ($filename =~ m{/}) {
    warn "skipping $url: file name contains a path separator\n";
    next;
  }

  my $dir = "./attachments/$id";
  if (!-d $dir) {
    mkdir $dir or warn "cannot create directory: $!\n";
  }

  # List-form pipe open runs wget without a shell, so characters such as "&",
  # ";" or "`" in the URL cannot be interpreted as shell syntax. "--" stops
  # wget from treating a URL that starts with "-" as an option. Reading from
  # the pipe keeps wget's stdout out of the HTML written to our own stdout.
  if (open my $wget, '-|', 'wget', '-O', "$dir/$filename", '--', $url) {
    my @discarded = <$wget>;
    close $wget or warn "wget failed for $url (status $?)\n";
  }
  else {
    warn "cannot run wget: $!\n";
  }
}

# Write the collected lines (with rewritten attachment links) to stdout,
# one print call per stored line.
for my $html_line (@file) {
  print $html_line;
}
