From 78ad431fbe9e1a166d17ed46572c4ba80ca47e4a Mon Sep 17 00:00:00 2001 From: Manuel Cillero Date: Wed, 26 Jul 2017 13:33:19 +0200 Subject: [PATCH] New module 'Link checker' --- sites/all/modules/linkchecker/CHANGELOG.txt | 178 ++ sites/all/modules/linkchecker/LICENSE.txt | 339 +++ sites/all/modules/linkchecker/README.txt | 59 + .../includes/linkchecker.admin.inc | 367 +++ .../includes/linkchecker.batch.inc | 280 +++ .../includes/linkchecker.pages.inc | 256 +++ .../all/modules/linkchecker/linkchecker.info | 10 + .../modules/linkchecker/linkchecker.install | 670 ++++++ .../modules/linkchecker/linkchecker.module | 1959 +++++++++++++++++ .../tests/linkchecker_extract_links.test | 441 ++++ 10 files changed, 4559 insertions(+) create mode 100644 sites/all/modules/linkchecker/CHANGELOG.txt create mode 100644 sites/all/modules/linkchecker/LICENSE.txt create mode 100644 sites/all/modules/linkchecker/README.txt create mode 100644 sites/all/modules/linkchecker/includes/linkchecker.admin.inc create mode 100644 sites/all/modules/linkchecker/includes/linkchecker.batch.inc create mode 100644 sites/all/modules/linkchecker/includes/linkchecker.pages.inc create mode 100644 sites/all/modules/linkchecker/linkchecker.info create mode 100644 sites/all/modules/linkchecker/linkchecker.install create mode 100644 sites/all/modules/linkchecker/linkchecker.module create mode 100644 sites/all/modules/linkchecker/tests/linkchecker_extract_links.test diff --git a/sites/all/modules/linkchecker/CHANGELOG.txt b/sites/all/modules/linkchecker/CHANGELOG.txt new file mode 100644 index 0000000..e37ad62 --- /dev/null +++ b/sites/all/modules/linkchecker/CHANGELOG.txt @@ -0,0 +1,178 @@ +linkchecker 6.x-2.8, June 7, 2014 +--------------------------------- + +* Issue #2280827: Notice: Undefined index: content-type in _linkchecker_status_handling() +* Issue #2125719 by hass: Links in unpublished nodes are still checked +* Issue #2088461 by fonant, hass: PDF link with #page= results in "URL fragment identifier 
not found in content" +* Added Smileys filter (http://drupal.org/project/smiley) to filter blacklist +* Issue #1306498 by hass, evolvingweb: Remove unnecessary hook_requirements in linkchecker.install +* Issue #1923328 by larowlan: Fragment check fails when source page contains spaces between attribute name, equals and value. +* Issue #1926558 by hass: Block extraction re-scans fail. + +linkchecker 6.x-2.7, February 14, 2013 +-------------------------------------- + +* Issue #1901018 by hass: ERROR - __clone method called on non-object. +* #1891900: Uninstall of variable 'linkchecker_check_connections_max' failed. +* #1719174: Editing node throws "Undefined property" stdClass::$nid in hook_node_prepare(). +* #1875602: Check URL fragment identifiers in content +* Minor code style fixes + +linkchecker 6.x-2.6, December 24, 2012 +-------------------------------------- + +* Fixed some typos. +* Upgraded outdated HTTP user agents. +* #380052: Add support with non-blocking parallel link checking. +* linkchecker_update_6211 was missing an update message. +* #1869924: Strict warning: Creating default object from empty value. +* #1867460: Prevent save on automatic updates, if content has not changed. +* #1804842 follow up: Exit if all node types are disabled +* #1450672: Cron task does not impersonate to admin user on automatic content updates. +* Added Drupal 6.14 warning to known issues. +* #1804842: Lots of warnings when comment module is not installed +* #1811908: Allow extracting protocol relative url's +* #1586022: linkchecker generating bad User-Agent string +* Status code 500 on HEAD/GET does not increase fail_count +* #1488572: i18n: "Permission restrictions deny" all broken links in other languages + +linkchecker 6.x-2.5, February 15, 2012 +-------------------------------------- + +* Fix for access bypass vulnerability. +* #1429284: Don't follow redirects +* Removed block 'title' for consistency reasons.
It's only a title in administration and should not contain URLs +* Show a recommendation next to blacklisted filter names. +* #1149596: Links with spaces are not extracted +* Backported some documentation fixes +* Batch redirect URL added for blocks +* #860700: Disable all non-http(s) links for link checking until we have curl implemented +* #1054118: After batch scan the target url is not pointing to the node view +* #1075390: Added HTTP range support (Performance) +* Links disabled for checking (blacklist, never checked) will be counted as unchecked and the number of unchecked links will therefore never become zero. +* #1321378: Improve performance of queries +* #953958: User able to see other user broken links +* truncate is ***much*** faster than delete. +* #725398: variable_get() may not be available in the install.php run +* First decode, then filter as URLs could theoretically consist of encoded blanks only. +* #1067160 by somimi: Minor grammar fixes to some of the descriptions in linkchecker.pages.inc +* Removing translation directories +* #903318: Internal Links in cck, weblinks and links fields are not extracted. +* Remove locale module setUp as notices in _linkchecker_add_node_links() have been fixed. +* Prevent notices if tests run without locale module. +* #780242 by AlexisWilke: Several E_NOTICE when dealing with the URLs +* #500090 by hass: Block removal of RFC domains in exclude links +* #685580 by wheyse: CCK field_name needs to be checked with empty() as the value could be empty +* Upgrade line break converter from D5 filter/2 to D6 filter/1.
+* #661484: Empty URL exclusion list disables link checking + +linkchecker 6.x-2.4, November 24, 2009 +-------------------------------------- + +* #566388 by wimh, hass: Re-check link status on next cron run +* #632732 by sleepcamel, hass: Add a message to $node->log when saving a new revision +* #627774 by charmer: Linkchecker Czech Translation +* Replace word "node" in text with "content" for easier translation +* #580248: Prevent link check failure if 'max_execution_time' = 0 (unlimited). +* Blacklisted "smileys/0" filter as the URLs are maintained in the Smiley module settings +* Fixed wrong variable name in single comment import function +* Updated README.txt +* Cleanup / refactoring _linkchecker_extract_links() +* #563464: Internal links reported as broken incorrectly on SSL only site. Use $base_root as site URL for prefixing absolute internal URLs. +* E_ALL saver... +* Better menu item description, D7 backport. +* Dynamic destination for user broken links page +* Use theme function for item_list +* Dynamic colspan for no data +* Joined wrong fields in user report with comments enabled +* Filters do not run on content as expected. +* db_last_insert_id('boxes', 'bid') could return a wrong value if other contrib modules add block hooks and insert something in other tables. +* Make all count() upper case +* Show items in a list +* Filter $link->error for arbitrary code +* #532178 by hass: Database update #6209 failed +* #196862 by hass: COUNT(*) is an expensive query in InnoDB. +* Added links extraction .test +* E_ALL: Fixed notice for undefined $node->language. +* E_ALL: Fixed notice caused by array_map(md5, $links) +* Typo fixed +* Changed unique key and column name 'token' into 'urlhash'. +* Added filter blacklist for link extraction. 
+* Remove reference to $form_state in hook_form_alter() +* Fix comments +* Add $node->type check +* Added support for node type 'weblinks' from 'weblinks' module +* D5: check_plain() all '@' string values for the use with strtr(). + + +linkchecker 6.x-2.3, August 1, 2009 +----------------------------------- + +As of this release linkchecker no longer depends on job_queue! + +* by wimh: Copy and paste error, used _linkchecker_node_links_missing() in _linkchecker_add_comment_links() +* #536744 by wimh, hass: Add broken links warning message to block edit page on view, but not after save. +* #536744 by wimh, hass: Fix comment edit form to show the broken links warning message only on 'view' and 'preview', but not after save. +* Fix inline documentation typos +* Code style, variable names, constant for limit, doc fixes, others +* #536672 by wimh, hass: Link references not updated when last link of a node/comment/block is removed +* More comments added. +* Updated translation files +* Use update_sql() for permission updates +* Save form settings on link scan and clear. +* #497096: Support links generated by input filter +* Remove duplicate lid's in SQL results +* #465414: Followup for relative links extracted incorrectly +* Note that custom link settings are lost on every clean and re-analyse task +* Added GUI for link check URL filter +* #524380: Do not check example.com/net/org domain links +* Updated installation steps in README.txt +* Added sample URLs for internal and external to description +* Changed permission name "access linkchecker" to "access broken links report" +* Added the status code 302 to the list of ignored response codes. +* Select from comment table only, if comment module is active. +* Remove ignored links from broken links view + + +linkchecker 6.x-2.2, July 14, 2009 +---------------------------------- + +* Removed confusing watchdog message. +* Use default node type revision setting and save a revision on 301 auto-update.
+* UX: Provide more descriptive help what the HTML tags are for. +* #451456: 301 auto-update could break links +* #452080: Update permanently moved internal links is broken +* Removed duplicate _linkchecker_add_xxx_links() calls. node_save automatically calls hook_nodeapi and comment_save automatically calls hook_comment with 'update'. +* Added a link to the job_queue module project page +* #465414: Stop incorrect extraction of relative links +* Typo fixed in batch finished message +* Removed unused default case +* Removed UI setting no longer having a real use case (and reduce complexity) +* Deleted the associated variable. +* One more chance to extract a URL (e.g. Java Applet) +* Added support for HTML5 tags (video|audio|source) +* Uncomment cURL requirement until implemented. +* #402742: Integration with path_redirect +* #427906, #417354: Extract links from various configurable HTML tags (a|area|img|iframe|object|param|embed) +* #504332: Cannot install, HTTP request status fails +* Show edit link only if user has permission to change settings +* New permission to edit link settings +* Add UI to change request method +* Add ignore filter for links pointing to buggy servers + + +linkchecker 6.x-2.1, March 26, 2009 +----------------------------------- + +* #409324 by Dave Reid: _linkchecker_delete_node_links() expects a nid, but passed a node object +* #386612 by hass: Prevent possible cron failures caused by too many link checks +* #408860 by Dave Reid: PHP notice: Undefined property: stdClass::$moderate in linkchecker.module on line 428.
+* #373485 by hass: DB errors when adding a new node +* #387228: linkchecker.module fails in PHP4 on line 910 +* #375767: Wrong table aliases and column names for comments and boxes +* Added MySQL 5.0 requirement check +* Show broken link message on node prepare, not view +* #380548: No results after configuration changes +* #380476: Descriptive text in Admin confusing +* #381986: Broken link does not immediately disappear in broken links list after link has been fixed in content +* #380872 by hass: Show informative message about current check status on reports page. diff --git a/sites/all/modules/linkchecker/LICENSE.txt b/sites/all/modules/linkchecker/LICENSE.txt new file mode 100644 index 0000000..d159169 --- /dev/null +++ b/sites/all/modules/linkchecker/LICENSE.txt @@ -0,0 +1,339 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. 
Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. 
+ + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. 
You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. 
+ +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. 
However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. 
+You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. 
diff --git a/sites/all/modules/linkchecker/README.txt b/sites/all/modules/linkchecker/README.txt new file mode 100644 index 0000000..7f32048 --- /dev/null +++ b/sites/all/modules/linkchecker/README.txt @@ -0,0 +1,59 @@ + +Link Checker +------------ + +Installation: + +1. Place the entire linkchecker folder into your modules directory. +2. Go to Administer -> Site building -> Modules and enable the Link checker module. +3. Go to Administer -> Site configuration -> Link checker and enable the node types to scan. +4. Check all HTML tags that should be scanned. +5. Adjust the other parameters if the defaults don't suit your needs. +6. Save configuration +7. Wait for cron to check all your links... this may take some time! :-) + +If links are broken they appear under Administer -> Reports -> Broken links. + +If not, make sure the cron is configured and running properly on your Drupal +installation. The Link checker module also logs somewhat useful info about its +activity under Administer -> Reports -> Recent log entries. + + +Required: + +1. For internal URL extraction you need to make sure that Cron always gets called + with your real public site URL (e.g. http://example.com/cron.php). Make + sure it's never executed with http://localhost/cron.php or any other + hostnames or ports that are not publicly reachable. Otherwise all links may be + reported as broken and cannot be verified as they should be. + + To make sure it always works - it's required to configure the $base_url in + the site's settings.php with your public site's URL. Better safe than sorry! + + +Known issues: + +There are a lot of known issues in drupal_http_request(). These have been solved +in HTTPRL. As a workaround it's recommended to use HTTPRL in linkchecker.
+ +Issues list: + +* #997648: drupal_http_request() always calls fread() one more time than necessary +* #164365-12: drupal_http_request() does handle (invalid) non-absolute redirects +* #205969-11: drupal_http_request() assumes presence of Reason-Phrase in response Status-Line +* #371495: Error message from drupal_http_request() not UTF8 encoded +* #193073-11: drupal_http_request - socket not initialized +* #106506-8: drupal_http_request() does not handle 'chunked' responses - Make it support HTTP 1.1 +* #1096890-15: drupal_http_request should return error if reaches max allowed redirects +* #875342-21: drupal_http_request() should pick up X-Drupal-Assertion-* HTTP headers +* #965078-31: HTTP request checking is unreliable and should be removed in favor of watchdog() calls +* #336367: HTTP client should protect commas when folding (compatibility with legacy HTTP/1.0) +* #45338: log fsockopen errors to watchdog + + +Other + +* Drupal 6.14 only: A critical core bug has been introduced by #193383: + set_time_limit: Centralize calls and prevent warnings and errors that stops + link checker from verifying links. Apply the available D6 hotfix in #111 or + update to Drupal 6.15+ to fix your installation. diff --git a/sites/all/modules/linkchecker/includes/linkchecker.admin.inc b/sites/all/modules/linkchecker/includes/linkchecker.admin.inc new file mode 100644 index 0000000..375e560 --- /dev/null +++ b/sites/all/modules/linkchecker/includes/linkchecker.admin.inc @@ -0,0 +1,367 @@ + 'fieldset', + '#title' => t('General settings'), + '#collapsible' => FALSE, + ); + $form['settings']['linkchecker_scan_nodetypes'] = array( + '#type' => 'checkboxes', + '#title' => t('Scan node types for links'), + '#default_value' => variable_get('linkchecker_scan_nodetypes', array()), + '#options' => array_map('check_plain', node_get_types('names')), + '#description' => t('Enable link checking for the selected node type(s).'), + ); + + $comment_dependencies = '
'; + $comment_dependencies .= t('Depends on: !dependencies', array('!dependencies' => (module_exists('comment') ? t('@module (enabled)', array('@module' => 'Comment')) : t('@module (disabled)', array('@module' => 'Comment'))))); + $comment_dependencies .= '
'; + + $form['settings']['linkchecker_scan_comments'] = array( + '#default_value' => variable_get('linkchecker_scan_comments', 0), + '#type' => 'checkbox', + '#title' => t('Scan comments for links'), + '#description' => t('Enable this checkbox if links in comments of the above selected node type(s) should be checked.') . $comment_dependencies, + '#disabled' => module_exists('comment') ? FALSE : TRUE, + ); + $form['settings']['linkchecker_scan_blocks'] = array( + '#default_value' => variable_get('linkchecker_scan_blocks', 0), + '#type' => 'checkbox', + '#title' => t('Scan blocks for links'), + '#description' => t('Enable this checkbox if links in blocks should be checked.'), + ); + $form['settings']['linkchecker_fqdn_only'] = array( + '#type' => 'select', + '#title' => t('What type of links should be checked?'), + '#description' => t('A full qualified link (http://example.com/foo/bar) to a page is considered external, whereas an absolute (/foo/bar) or relative link (node/123) without a domain is considered internal.'), + '#default_value' => variable_get('linkchecker_fqdn_only', 1), + '#options' => array( + '0' => t('Internal and external'), + '1' => t('External only (http://example.com/foo/bar)'), + '2' => t('Internal only (node/123)'), + ), + ); + + $form['tag'] = array( + '#type' => 'fieldset', + '#title' => t('Link extraction'), + '#collapsible' => FALSE, + ); + $form['tag']['linkchecker_extract_from_a'] = array( + '#default_value' => variable_get('linkchecker_extract_from_a', 1), + '#type' => 'checkbox', + '#title' => t('Extract links in <a> and <area> tags'), + '#description' => t('Enable this checkbox if normal hyperlinks should be extracted. The anchor element defines a hyperlink, the named target destination for a hyperlink, or both. 
The area element defines a hot-spot region on an image, and associates it with a hypertext link.'), + ); + $form['tag']['linkchecker_extract_from_audio'] = array( + '#default_value' => variable_get('linkchecker_extract_from_audio', 0), + '#type' => 'checkbox', + '#title' => t('Extract links in <audio> tags'), + '#description' => t('Enable this checkbox if links in audio tags should be extracted. The audio element is used to embed audio content.'), + ); + $form['tag']['linkchecker_extract_from_embed'] = array( + '#default_value' => variable_get('linkchecker_extract_from_embed', 0), + '#type' => 'checkbox', + '#title' => t('Extract links in <embed> tags'), + '#description' => t('Enable this checkbox if links in embed tags should be extracted. This is an obsolete and non-standard element that was used for embedding plugins in past and should no longer used in modern websites.'), + ); + $form['tag']['linkchecker_extract_from_iframe'] = array( + '#default_value' => variable_get('linkchecker_extract_from_iframe', 0), + '#type' => 'checkbox', + '#title' => t('Extract links in <iframe> tags'), + '#description' => t('Enable this checkbox if links in iframe tags should be extracted. The iframe element is used to embed another HTML page into a page.'), + ); + $form['tag']['linkchecker_extract_from_img'] = array( + '#default_value' => variable_get('linkchecker_extract_from_img', 0), + '#type' => 'checkbox', + '#title' => t('Extract links in <img> tags'), + '#description' => t('Enable this checkbox if links in image tags should be extracted. The img element is used to add images to the content.'), + ); + $form['tag']['linkchecker_extract_from_object'] = array( + '#default_value' => variable_get('linkchecker_extract_from_object', 0), + '#type' => 'checkbox', + '#title' => t('Extract links in <object> and <param> tags'), + '#description' => t('Enable this checkbox if multimedia and other links in object and their param tags should be extracted. 
The object tag is used for flash, java, quicktime and other applets.'), + ); + $form['tag']['linkchecker_extract_from_source'] = array( + '#default_value' => variable_get('linkchecker_extract_from_source', 0), + '#type' => 'checkbox', + '#title' => t('Extract links in <source> tags'), + '#description' => t('Enable this checkbox if links in source tags should be extracted. The source element is used to specify multiple media resources for audio and video elements.'), + ); + $form['tag']['linkchecker_extract_from_video'] = array( + '#default_value' => variable_get('linkchecker_extract_from_video', 0), + '#type' => 'checkbox', + '#title' => t('Extract links in <video> tags'), + '#description' => t('Enable this checkbox if links in video tags should be extracted. The video element is used to embed video content.'), + ); + + // Get all filters available on the system. + $filters = filter_list_all(); + $filter_options = array(); + foreach ($filters as $filter) { + $name = $filter->module . '/' . $filter->delta; + if (in_array($name, explode('|', LINKCHECKER_DEFAULT_FILTER_BLACKLIST))) { + $filter_options[$name] = t('!title (Recommended)', array('!title' => $filter->name)); + } + else { + $filter_options[$name] = $filter->name; + } + } + $form['tag']['linkchecker_filter_blacklist'] = array( + '#type' => 'checkboxes', + '#title' => t('Filters disabled for link extraction'), + '#default_value' => variable_get('linkchecker_filter_blacklist', explode('|', LINKCHECKER_DEFAULT_FILTER_BLACKLIST)), + '#options' => $filter_options, + '#description' => t('If a filter has been enabled for an input format it runs first and afterwards the link extraction. This helps the link checker module to find all links normally created by custom filters (e.g. Markdown filter, Bbcode). All filters used as inline references (e.g. Weblink filter [link: id]) to other content and filters only wasting processing time (e.g. Line break converter) should be disabled. 
This setting does not have any effect on how content is shown on a page. This feature optimizes the internal link extraction process for link checker and prevents false alarms about broken links in content not having the real data of a link.'), + ); + + $count_lids_enabled = db_result(db_query("SELECT count(lid) FROM {linkchecker_links} WHERE status = %d", 1)); + $count_lids_disabled = db_result(db_query("SELECT count(lid) FROM {linkchecker_links} WHERE status = %d", 0)); + $form['check'] = array( + '#type' => 'fieldset', + '#title' => t('Check settings'), + '#description' => t('For simultaneous link checks it is recommended to install the HTTP Parallel Request & Threading Library. This may be necessary on larger sites with very many links (30.000+), but will also improve overall link check duration on smaller sites. Currently the site has @count links (@count_enabled enabled / @count_disabled disabled).', array('@httprl' => 'http://drupal.org/project/httprl', '@count' => $count_lids_enabled+$count_lids_disabled, '@count_enabled' => $count_lids_enabled, '@count_disabled' => $count_lids_disabled)), + '#collapsible' => FALSE, + ); + $form['check']['linkchecker_check_library'] = array( + '#type' => 'select', + '#title' => t('Check library'), + '#description' => t('Defines the library that is used for checking links.'), + '#default_value' => variable_get('linkchecker_check_library', 'core'), + '#options' => array( + 'core' => t('Drupal core'), + 'httprl' => t('HTTP Parallel Request & Threading Library'), + ), + ); + $form['check']['linkchecker_check_connections_max'] = array( + '#type' => 'select', + '#title' => t('Number of simultaneous connections'), + '#description' => t('Defines the maximum number of simultaneous connections that can be opened by the server. HTTP Parallel Request & Threading Library make sure that a single domain is not overloaded beyond RFC limits. 
For small hosting plans with very limited CPU and RAM it may be required to reduce the default limit.'), + '#default_value' => variable_get('linkchecker_check_connections_max', 8), + '#options' => drupal_map_assoc(array(2, 4, 8, 16, 24, 32, 48, 64, 96, 128)), + ); + $form['check']['linkchecker_check_useragent'] = array( + '#type' => 'select', + '#title' => t('User-Agent'), + '#description' => t('Defines the user agent that will be used for checking links on remote sites. If someone blocks the standard Drupal user agent you can try with a more common browser.'), + '#default_value' => variable_get('linkchecker_check_useragent', 'Drupal (+http://drupal.org/)'), + '#options' => array( + 'Drupal (+http://drupal.org/)' => 'Drupal (+http://drupal.org/)', + 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)' => 'Windows 7 (x64), Internet Explorer 9.0', + 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:17.0) Gecko/20100101 Firefox/17.0' => 'Windows 7 (x64), Mozilla Firefox 17.0', + 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; WOW64; Trident/6.0)' => 'Windows 8 (x64), Internet Explorer 10.0', + 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:17.0) Gecko/20100101 Firefox/17.0' => 'Windows 8 (x64), Mozilla Firefox 17.0', + ), + ); + $form['check']['linkchecker_check_links_interval'] = array( + '#type' => 'select', + '#title' => t('Check interval for links'), + '#description' => t('This interval setting defines how often cron will re-check the status of links.'), + '#default_value' => variable_get('linkchecker_check_links_interval', 2419200), + '#options' => drupal_map_assoc(array(86400, 172800, 259200, 604800, 1209600, 2419200, 4838400), 'format_interval'), + ); + $form['check']['linkchecker_disable_link_check_for_urls'] = array( + '#default_value' => variable_get('linkchecker_disable_link_check_for_urls', LINKCHECKER_RESERVED_DOCUMENTATION_DOMAINS), + '#type' => 'textarea', + '#title' => t('Do not check the link status of links containing these URLs'), + 
'#description' => t('By default this list contains the domain names reserved for use in documentation and not available for registration. See RFC 2606, Section 3 for more information. URLs on this list are still extracted, but the link setting Check link status becomes automatically disabled to prevent false alarms. If you change this list you need to clear all link data and re-analyze your content. Otherwise this setting will only affect new links added after the configuration change.', array('@rfc-2606' => 'http://www.rfc-editor.org/rfc/rfc2606.txt')), + '#wysiwyg' => FALSE, + ); + + $form['error'] = array( + '#type' => 'fieldset', + '#title' => t('Error handling'), + '#description' => t('Defines error handling and custom actions to be executed if specific HTTP requests are failing.'), + '#collapsible' => FALSE, + ); + $linkchecker_default_impersonate_user = user_load(1); + $form['error']['linkchecker_impersonate_user'] = array( + '#type' => 'textfield', + '#title' => t('Impersonate user account'), + '#description' => t('If below error handling actions are executed they can be impersonated with a custom user account. By default this is user %name, but you are able to assign a custom user to allow easier identification of these automatic revision updates. Make sure you select a user with full permissions on your site or the user may not able to access and save all content.', array('%name' => $linkchecker_default_impersonate_user->name)), + '#size' => 30, + '#maxlength' => 60, + '#autocomplete_path' => 'user/autocomplete', + '#default_value' => variable_get('linkchecker_impersonate_user', ''), + ); + $form['error']['linkchecker_action_status_code_301'] = array( + '#title' => t('Update permanently moved links'), + '#description' => t('If enabled, outdated links in content providing a status Moved Permanently (status code 301) are automatically updated to the most recent URL. 
If used, it is recommended to use a value of three to make sure this is not only a temporarily change. This feature trust sites to provide a valid permanent redirect. A new node revision is automatically created on link updates if create new revision is enabled in the node type workflow settings. It is recommended to create new revisions for all link checker enabled node types. Link updates are nevertheless always logged in recent log entries.', array('@dblog' => url('admin/reports/dblog'), '@node_types' => url('admin/content/types'))), + '#type' => 'select', + '#default_value' => variable_get('linkchecker_action_status_code_301', 0), + '#options' => array( + 0 => t('Disabled'), + 1 => t('After one failed check'), + 2 => t('After two failed checks'), + 3 => t('After three failed checks'), + 5 => t('After five failed checks'), + 10 => t('After ten failed checks'), + ), + ); + $form['error']['linkchecker_action_status_code_404'] = array( + '#title' => t('Unpublish node on file not found error'), + '#description' => t('If enabled, a node with one or more broken links (status code 404) will be unpublished and moved to moderation queue for review after the number of specified checks failed. 
If used, it is recommended to use a value of three to make sure this is not only a temporarily error.'), + '#type' => 'select', + '#default_value' => variable_get('linkchecker_action_status_code_404', 0), + '#options' => array( + 0 => t('Disabled'), + 1 => t('After one file not found error'), + 2 => t('After two file not found errors'), + 3 => t('After three file not found errors'), + 5 => t('After five file not found errors'), + 10 => t('After ten file not found errors'), + ), + ); + $form['error']['linkchecker_ignore_response_codes'] = array( + '#default_value' => variable_get('linkchecker_ignore_response_codes', "200\n206\n302\n304\n401\n403"), + '#type' => 'textarea', + '#title' => t("Don't treat these response codes as errors"), + '#description' => t('One HTTP status code per line, e.g. 403.'), + '#wysiwyg' => FALSE, + ); + + // Buttons are only required for testing and debugging reasons. + $description = '

' . t('These actions will either clear all link checker tables in the database and/or analyze all selected node types, blocks and cck fields (see settings above) for new/updated/removed links. Normally there is no need to press one of these buttons. Use this only for immediate cleanup tasks and to force a full re-build of the links to be checked in the linkchecker tables. Keep in mind that all custom link settings will be lost if you clear link data!') . '

'; + $description .= '

' . t('Note: These functions ONLY collect the links, they do not evaluate the HTTP response codes, this will be done during normal cron runs.') . '

'; + + $form['clear'] = array( + '#type' => 'fieldset', + '#title' => t('Maintenance'), + '#description' => $description, + '#collapsible' => TRUE, + '#collapsed' => TRUE, + ); + $form['clear']['linkchecker_analyze'] = array( + '#type' => 'submit', + '#value' => t('Reanalyze content for links'), + '#submit' => array('linkchecker_analyze_links_submit'), + ); + $form['clear']['linkchecker_clear_analyze'] = array( + '#type' => 'submit', + '#value' => t('Clear link data and analyze content for links'), + '#submit' => array('linkchecker_clear_analyze_links_submit'), + ); + + $form['buttons']['submit'] = array('#type' => 'submit', '#value' => t('Save configuration') ); + $form['buttons']['reset'] = array('#type' => 'submit', '#value' => t('Reset to defaults') ); + + return $form; +} + +function linkchecker_admin_settings_form_validate($form, &$form_state) { + $form_state['values']['linkchecker_disable_link_check_for_urls'] = trim($form_state['values']['linkchecker_disable_link_check_for_urls']); + $form_state['values']['linkchecker_ignore_response_codes'] = trim($form_state['values']['linkchecker_ignore_response_codes']); + $ignore_response_codes = preg_split('/(\r\n?|\n)/', $form_state['values']['linkchecker_ignore_response_codes']); + foreach ($ignore_response_codes as $ignore_response_code) { + if (!_linkchecker_isvalid_response_code($ignore_response_code)) { + form_set_error('linkchecker_ignore_response_codes', t('Invalid response code %code found.', array('%code' => $ignore_response_code))); + } + } + + // Prevent the removal of RFC documentation domains. This are the official and + // reserved documentation domains and not "example" hostnames! 
+ $linkchecker_disable_link_check_for_urls = array_filter(preg_split('/(\r\n?|\n)/', $form_state['values']['linkchecker_disable_link_check_for_urls'])); + $form_state['values']['linkchecker_disable_link_check_for_urls'] = implode("\n", array_unique(array_merge(explode("\n", LINKCHECKER_RESERVED_DOCUMENTATION_DOMAINS), $linkchecker_disable_link_check_for_urls))); + + // Validate impersonation user name. + $linkchecker_impersonate_user = user_load(array('name' => $form_state['values']['linkchecker_impersonate_user'])); + if (empty($linkchecker_impersonate_user->uid)) { + form_set_error('linkchecker_impersonate_user', t('User account %name cannot found.', array('%name' => $form_state['values']['linkchecker_impersonate_user']))); + } +} + +function linkchecker_admin_settings_form_submit($form, &$form_state) { + // Exclude unnecessary elements. + unset($form_state['values']['linkchecker_analyze'], $form_state['values']['linkchecker_clear_analyze']); + + // Save form settings. + system_settings_form_submit($form, $form_state); + + // Re-scan items, if node types or comment or block selection have been changed. + $additional_nodetypes_selected = array_diff($form_state['values']['linkchecker_scan_nodetypes'], $form['settings']['linkchecker_scan_nodetypes']['#default_value']); + if (!empty($additional_nodetypes_selected) || $form_state['values']['linkchecker_scan_comments'] > $form['settings']['linkchecker_scan_comments']['#default_value']) { + $node_types = array_keys(array_filter($form_state['values']['linkchecker_scan_nodetypes'])); + + // If one or more node types have been selected. + if (!empty($node_types)) { + module_load_include('inc', 'linkchecker', '/includes/linkchecker.batch'); + batch_set(_linkchecker_batch_import_nodes($node_types)); + + // If comment scanning of node types has been selected. 
+ if ($form_state['values']['linkchecker_scan_comments'] > $form['settings']['linkchecker_scan_comments']['#default_value']) { + batch_set(_linkchecker_batch_import_comments($node_types)); + } + } + } + + // If block scanning has been selected. + if ($form_state['values']['linkchecker_scan_blocks'] > $form['settings']['linkchecker_scan_blocks']['#default_value']) { + module_load_include('inc', 'linkchecker', '/includes/linkchecker.batch'); + batch_set(_linkchecker_batch_import_boxes()); + } +} + +/** + * Submit callback; Analyze all node types, boxes and cck fields. + */ +function linkchecker_analyze_links_submit($form, &$form_state) { + // Exclude unnecessary elements. + unset($form_state['values']['linkchecker_analyze'], $form_state['values']['linkchecker_clear_analyze']); + + // Save form settings. + system_settings_form_submit($form, $form_state); + + // Start batch and analyze all nodes. + $node_types = array_keys(array_filter(variable_get('linkchecker_scan_nodetypes', array()))); + if (!empty($node_types)) { + module_load_include('inc', 'linkchecker', '/includes/linkchecker.batch'); + batch_set(_linkchecker_batch_import_nodes($node_types)); + if (variable_get('linkchecker_scan_comments', 0)) { + batch_set(_linkchecker_batch_import_comments($node_types)); + } + } + + if (variable_get('linkchecker_scan_blocks', 0)) { + module_load_include('inc', 'linkchecker', '/includes/linkchecker.batch'); + batch_set(_linkchecker_batch_import_boxes()); + } +} + +/** + * Submit callback; Clear link data and analyze all node types, boxes and cck fields. + */ +function linkchecker_clear_analyze_links_submit($form, &$form_state) { + // Exclude unnecessary elements. + unset($form_state['values']['linkchecker_analyze'], $form_state['values']['linkchecker_clear_analyze']); + + // Save form settings. 
+ system_settings_form_submit($form, $form_state); + + db_query("TRUNCATE TABLE {linkchecker_nodes}"); + db_query("TRUNCATE TABLE {linkchecker_comments}"); + db_query("TRUNCATE TABLE {linkchecker_boxes}"); + db_query("TRUNCATE TABLE {linkchecker_links}"); + + // Start batch and analyze all nodes. + $node_types = array_keys(array_filter(variable_get('linkchecker_scan_nodetypes', array()))); + if (!empty($node_types)) { + module_load_include('inc', 'linkchecker', '/includes/linkchecker.batch'); + batch_set(_linkchecker_batch_import_nodes($node_types)); + if (variable_get('linkchecker_scan_comments', 0)) { + batch_set(_linkchecker_batch_import_comments($node_types)); + } + } + + if (variable_get('linkchecker_scan_blocks', 0)) { + module_load_include('inc', 'linkchecker', '/includes/linkchecker.batch'); + batch_set(_linkchecker_batch_import_boxes()); + } +} diff --git a/sites/all/modules/linkchecker/includes/linkchecker.batch.inc b/sites/all/modules/linkchecker/includes/linkchecker.batch.inc new file mode 100644 index 0000000..478ec04 --- /dev/null +++ b/sites/all/modules/linkchecker/includes/linkchecker.batch.inc @@ -0,0 +1,280 @@ + drupal_get_path('module', 'linkchecker') .'/includes/linkchecker.batch.inc', + 'finished' => '_linkchecker_batch_node_import_finished', + 'operations' => $operations, + 'title' => t('Scanning for links'), + ); + + return $batch; +} + +/** + * Batch operation: Scan ony by one node for links. + */ +function _linkchecker_batch_node_import_op($nid, &$context) { + // Load the node and scan for links. + $node = node_load($nid, NULL, TRUE); + _linkchecker_add_node_links($node); + + // Store results for post-processing in the finished callback. 
+ $context['results'][] = $node->nid; + $context['message'] = t('Node: @title', array('@title' => $node->title)); +} + +function _linkchecker_batch_node_import_finished($success, $results, $operations) { + if ($success) { + $message = format_plural(count($results), 'One node has been scanned.', '@count nodes have been scanned.'); + } + else { + $message = t('Scanning for links in nodes have failed with an error.'); + } + drupal_set_message($message); +} + +/** + * Batch: Scan comments for links. + */ +function _linkchecker_batch_import_comments($node_types = array()) { + // Get all active {comments}.cid's. + $result = db_query('SELECT c.cid FROM {comments} c INNER JOIN {node} n ON c.nid = n.nid WHERE c.status = %d AND n.status = %d AND n.type IN (' . db_placeholders($node_types, 'varchar') . ') ORDER BY c.cid', array_merge(array(COMMENT_PUBLISHED, 1), $node_types)); + + $operations = array(); + while ($row = db_fetch_array($result)) { + $operations[] = array('_linkchecker_batch_comments_import_op', array($row['cid'])); + } + $batch = array( + 'file' => drupal_get_path('module', 'linkchecker') .'/includes/linkchecker.batch.inc', + 'finished' => '_linkchecker_batch_comments_import_finished', + 'operations' => $operations, + 'title' => t('Scanning for links'), + ); + + return $batch; +} + +/** + * Batch operation: Scan ony by one comment for links. + */ +function _linkchecker_batch_comments_import_op($cid, &$context) { + // Load the comment and scan for links. + $comment = _linkchecker_comment_load($cid); + _linkchecker_add_comment_links($comment); + + // Store results for post-processing in the finished callback. 
+ $context['results'][] = $comment['cid']; + $context['message'] = t('Comment: @title', array('@title' => $comment['subject'])); +} + +function _linkchecker_batch_comments_import_finished($success, $results, $operations) { + if ($success) { + $message = format_plural(count($results), 'One comment has been scanned.', '@count comments have been scanned.'); + } + else { + $message = t('Scanning for links in comments have failed with an error.'); + } + drupal_set_message($message); +} + +/** + * Batch: Scan blocks for links. + */ +function _linkchecker_batch_import_boxes() { + // Get all {boxes}.bid's as block module suxxx and has no usable hooks. + $result = db_query('SELECT b.bid FROM {boxes} b ORDER BY b.bid'); + + $operations = array(); + while ($row = db_fetch_array($result)) { + $operations[] = array('_linkchecker_batch_import_boxes_op', array($row['bid'])); + } + $batch = array( + 'file' => drupal_get_path('module', 'linkchecker') .'/includes/linkchecker.batch.inc', + 'finished' => '_linkchecker_batch_box_import_finished', + 'operations' => $operations, + 'title' => t('Scanning for links'), + ); + + return $batch; +} + +/** + * Batch operation: Scan ony by one block for links. + */ +function _linkchecker_batch_import_boxes_op($bid, &$context) { + // Load the box and scan for links. + $box = block_box_get($bid); + _linkchecker_add_box_links($box, $box['bid']); + + // Store some result for post-processing in the finished callback. + $context['results'][] = $box['bid']; + $context['message'] = t('Block: @title', array('@title' => $box['info'])); +} + +function _linkchecker_batch_box_import_finished($success, $results, $operations) { + if ($success) { + $message = format_plural(count($results), 'One block has been scanned.', '@count blocks have been scanned.'); + } + else { + $message = t('Scanning for links in blocks have failed with an error.'); + } + drupal_set_message($message); +} + +/** + * Recurring scans of a single node via batch API. 
+ * + * @param int $nid + * The unique node id to scan for links. + * @param int $missing_links_count + * The number of links not yet added to linkchecker_links table. By this + * number the re-scan rounds are calulated. + * @return + * The batch task definition. + */ +function _linkchecker_batch_import_single_node($nid, $missing_links_count) { + $operations = array(); + for ($i = 0; $i <= $missing_links_count; $i = $i+LINKCHECKER_SCAN_MAX_LINKS_PER_RUN) { + $operations[] = array('_linkchecker_batch_single_node_import_op', array($nid)); + } + $batch = array( + 'file' => drupal_get_path('module', 'linkchecker') .'/includes/linkchecker.batch.inc', + 'finished' => '_linkchecker_batch_single_node_import_finished', + 'operations' => $operations, + 'title' => t('Scanning for links'), + 'progress_message' => t('Remaining @remaining of @total scans.'), + ); + + return $batch; +} + +function _linkchecker_batch_single_node_import_op($nid, &$context) { + // Load the node and scan for links. + $node = node_load($nid, NULL, TRUE); + _linkchecker_add_node_links($node, TRUE); + + // Store results for post-processing in the finished callback. + $context['results'][] = $node->nid; + $context['message'] = t('Node: @title', array('@title' => $node->title)); +} + +function _linkchecker_batch_single_node_import_finished($success, $results, $operations) { + if ($success) { + $message = format_plural(count($results), 'Node @nid has been re-scanned once to collect all links.', 'Node @nid has been re-scanned @count times to collect all links.', array('@nid' => $results[0])); + } + else { + $message = t('Recurring scanning for links in node @nid have failed with an error.', array('@nid' => $results[0])); + } + drupal_set_message($message); +} + +/** + * Recurring scans of a single comment via batch API. + * + * @param int $cid + * The unique comment id to scan for links. + * @param int $missing_links_count + * The number of links not yet added to linkchecker_links table. 
By this + * number the re-scan rounds are calulated. + * @return + * The batch task definition. + */ +function _linkchecker_batch_import_single_comment($cid, $missing_links_count) { + $operations = array(); + for ($i = 0; $i <= $missing_links_count; $i = $i+LINKCHECKER_SCAN_MAX_LINKS_PER_RUN) { + $operations[] = array('_linkchecker_batch_single_comment_import_op', array($cid)); + } + $batch = array( + 'file' => drupal_get_path('module', 'linkchecker') .'/includes/linkchecker.batch.inc', + 'finished' => '_linkchecker_batch_single_comment_import_finished', + 'operations' => $operations, + 'title' => t('Scanning for links'), + 'progress_message' => t('Remaining @remaining of @total scans.'), + ); + + return $batch; +} + +function _linkchecker_batch_single_comment_import_op($cid, &$context) { + $comment = _linkchecker_comment_load($cid); + _linkchecker_add_comment_links($comment, TRUE); + + // Store results for post-processing in the finished callback. + $context['results'][] = $comment['cid']; + $context['message'] = t('Comment: @title', array('@title' => $comment['subject'])); +} + +function _linkchecker_batch_single_comment_import_finished($success, $results, $operations) { + if ($success) { + $message = format_plural(count($results), 'Comment @cid has been re-scanned once to collect all links.', 'Comment @cid has been re-scanned @count times to collect all links.', array('@cid' => $results[0])); + } + else { + $message = t('Recurring scanning for links in comment @cid have failed with an error.', array('@cid' => $results[0])); + } + drupal_set_message($message); +} + +/** + * Recurring scans of a single block via batch API. + * + * @param int $bid + * The unique block id to scan for links. + * @param int $missing_links_count + * The number of links not yet added to linkchecker_links table. By this + * number the re-scan rounds are calulated. + * @return + * The batch task definition. 
+ */ +function _linkchecker_batch_import_single_box($bid, $missing_links_count) { + $operations = array(); + for ($i = 0; $i <= $missing_links_count; $i = $i+LINKCHECKER_SCAN_MAX_LINKS_PER_RUN) { + $operations[] = array('_linkchecker_batch_single_box_import_op', array($bid)); + } + $batch = array( + 'file' => drupal_get_path('module', 'linkchecker') .'/includes/linkchecker.batch.inc', + 'finished' => '_linkchecker_batch_single_box_import_finished', + 'operations' => $operations, + 'title' => t('Scanning for links'), + 'progress_message' => t('Remaining @remaining of @total scans.'), + ); + + return $batch; +} + +function _linkchecker_batch_single_box_import_op($bid, &$context) { + // Load the box and scan for links. + $box = block_box_get($bid); + _linkchecker_add_box_links($box, $box['bid'], TRUE); + + // Store some result for post-processing in the finished callback. + $context['results'][] = $box['bid']; + $context['message'] = t('Block: @title', array('@title' => $box['info'])); +} + +function _linkchecker_batch_single_box_import_finished($success, $results, $operations) { + if ($success) { + $message = format_plural(count($results), 'Block @bid has been re-scanned once to collect all links.', 'Block @bid has been re-scanned @count times to collect all links.', array('@bid' => $results[0])); + } + else { + $message = t('Recurring scanning for links in block @bid have failed with an error.', array('@bid' => $results[0])); + } + drupal_set_message($message); +} diff --git a/sites/all/modules/linkchecker/includes/linkchecker.pages.inc b/sites/all/modules/linkchecker/includes/linkchecker.pages.inc new file mode 100644 index 0000000..5cd12f0 --- /dev/null +++ b/sites/all/modules/linkchecker/includes/linkchecker.pages.inc @@ -0,0 +1,256 @@ + %d AND ll.status = %d AND ll.code NOT IN (" . db_placeholders($ignore_response_codes, 'int') . ")"; + + // Build the array variable with all parameters for the pager_query. 
+ $links_report_parameters = array_merge(array(0, 1), $ignore_response_codes); + + return _linkchecker_report_page($links_report_sql, $links_report_parameters); +} + +/** + * Menu callback for author specific reporting. + */ +function linkchecker_user_report_page($account) { + drupal_set_title(check_plain($account->name)); + + $ignore_response_codes = preg_split('/(\r\n?|\n)/', variable_get('linkchecker_ignore_response_codes', "200\n206\n302\n304\n401\n403")); + + // Search for broken links in nodes and comments of the current user. + if (module_exists('comment') && variable_get('linkchecker_scan_comments', 0)) { + $links_report_sql = "SELECT ll.* + FROM {linkchecker_links} ll + INNER JOIN ( + SELECT lid FROM ( + SELECT DISTINCT ll.lid + FROM {node} n + INNER JOIN {node_revisions} r ON r.vid = n.vid + INNER JOIN {linkchecker_nodes} ln ON ln.nid = n.nid + INNER JOIN {linkchecker_links} ll ON ll.lid = ln.lid AND ll.last_checked <> %d AND ll.status = %d AND ll.code NOT IN (" . db_placeholders($ignore_response_codes, 'int') . ") + WHERE n.uid = %d OR r.uid = %d + UNION + SELECT DISTINCT ll.lid + FROM {comments} c + INNER JOIN {linkchecker_comments} lc ON lc.cid = c.cid + INNER JOIN {linkchecker_links} ll ON ll.lid = lc.lid AND ll.last_checked <> %d AND ll.status = %d AND ll.code NOT IN (" . db_placeholders($ignore_response_codes, 'int') . ") + WHERE c.uid = %d + ) q1 + ) q2 ON q2.lid = ll.lid"; + + // Build the array variable with all parameters for the pager_query with comment module enabled. + $links_report_parameters = array_merge(array(0, 1), $ignore_response_codes, array($account->uid, $account->uid, 0, 1), $ignore_response_codes, array($account->uid)); + } + else { + // Search for broken links in nodes of the current user. 
+ $links_report_sql = "SELECT ll.* + FROM {linkchecker_links} ll + INNER JOIN ( + SELECT lid FROM ( + SELECT DISTINCT ll.lid + FROM {node} n + INNER JOIN {node_revisions} r ON r.vid = n.vid + INNER JOIN {linkchecker_nodes} ln ON ln.nid = n.nid + INNER JOIN {linkchecker_links} ll ON ll.lid = ln.lid AND ll.last_checked <> %d AND ll.status = %d AND ll.code NOT IN (" . db_placeholders($ignore_response_codes, 'int') . ") + WHERE n.uid = %d OR r.uid = %d + ) q1 + ) q2 ON q2.lid = ll.lid"; + + // Build the array variable with all parameters for the pager_query with comment module disabled. + $links_report_parameters = array_merge(array(0, 1), $ignore_response_codes, array($account->uid, $account->uid)); + } + + return _linkchecker_report_page($links_report_sql, $links_report_parameters, $account); +} + +function _linkchecker_report_page($links_report_sql, $links_report_parameters = NULL, $account = NULL) { + + $links_unchecked = db_result(db_query('SELECT COUNT(1) FROM {linkchecker_links} WHERE last_checked = %d AND status = %d', 0, 1)); + if ($links_unchecked > 0) { + $links_all = db_result(db_query('SELECT COUNT(1) FROM {linkchecker_links} WHERE status = %d', 1)); + drupal_set_message(format_plural($links_unchecked, + 'There is 1 unchecked link of about @links_all links in the database. Please be patient until all links have been checked via cron.', + 'There are @count unchecked links of about @links_all links in the database. Please be patient until all links have been checked via cron.', + array('@links_all' => $links_all)), 'warning'); + } + + $header = array( + array('data' => t('URL'), 'field' => 'url', 'sort' => 'desc'), + array('data' => t('Response'), 'field' => 'code', 'sort' => 'desc'), + array('data' => t('Error'), 'field' => 'error'), + array('data' => t('Operations')), + ); + + $result = pager_query($links_report_sql . tablesort_sql($header), 50, 0, NULL, $links_report_parameters); + + // Evaluate permission once for performance reasons. 
+ $access_edit_link_settings = user_access('edit link settings'); + $access_administer_blocks = user_access('administer blocks'); + $access_administer_redirects = user_access('administer redirects'); + + $rows = array(); + while ($link = db_fetch_object($result)) { + // Get the node, block and comment IDs that refer to this broken link and + // that the current user has access to. + $nids = _linkchecker_link_node_ids($link, $account); + $cids = _linkchecker_link_comment_ids($link, $account); + $bids = _linkchecker_link_block_ids($link); + + // If the user does not have access to see this link anywhere, do not + // display it, for reasons explained in _linkchecker_link_access(). We + // still need to fill the table row, though, so as not to throw off the + // number of items in the pager. + if (empty($nids) && empty($cids) && empty($bids)) { + $rows[] = array(array('data' => t('Permission restrictions deny you access to this broken link.'), 'colspan' => count($header))); + continue; + } + + $links = array(); + + // Show links to link settings. + if ($access_edit_link_settings) { + $links[] = l(t('Edit link settings'), 'linkchecker/' . $link->lid . '/edit', array('query' => drupal_get_destination())); + } + + // Show link to nodes having this broken link. + foreach ($nids as $nid) { + $links[] = l(t('Edit node @node', array('@node' => $nid)), 'node/' . $nid . '/edit', array('query' => drupal_get_destination())); + } + + // Show link to comments having this broken link. + if (module_exists('comment') && variable_get('linkchecker_scan_comments', 0)) { + foreach ($cids as $cid) { + $links[] = l(t('Edit comment @comment', array('@comment' => $cid)), 'comment/edit/' . $cid, array('query' => drupal_get_destination())); + } + } + + // Show link to blocks having this broken link. + if ($access_administer_blocks) { + foreach ($bids as $bid) { + $links[] = l(t('Edit block @block', array('@block' => $bid)), 'admin/build/block/configure/block/' . 
$bid, array('query' => drupal_get_destination())); + } + } + + // Show link to redirect this broken internal link. + if (module_exists('path_redirect') && $access_administer_redirects && _linkchecker_is_internal_url($link)) { + $links[] = l(t('Create redirection'), 'admin/build/path-redirect/add', array('query' => array('source' => $link->internal))); + } + + // Create table data for output. Use inline styles to prevent extra CSS file. + $rows[] = array( + l(_filter_url_trim($link->url, 40), $link->url), + $link->code, + check_plain($link->error), + theme('item_list', $links), + ); + } + + if (empty($rows)) { + $rows[] = array(array('data' => t('No broken links have been found.'), 'colspan' => count($header))); + } + + $output = theme('table', $header, $rows); + $output .= theme('pager', NULL, 3000, 0); + return $output; +} + +function linkchecker_link_edit_form(&$form_state, $link) { + + $form['settings'] = array( + '#type' => 'fieldset', + '#title' => t('Settings'), + '#collapsible' => FALSE, + '#description' => t('The link @url was last checked on @last_checked and failed @fail_count times.', array('@url' => $link['url'], '@fail_count' => $link['fail_count'], '@last_checked' => format_date($link['last_checked']))) + ); + + $form['settings']['lid'] = array('#type' => 'hidden', '#value' => $link['lid']); + $form['settings']['url'] = array('#type' => 'hidden', '#value' => $link['url']); + + $form['settings']['method'] = array( + '#type' => 'select', + '#title' => t('Select request method'), + '#default_value' => $link['method'], + '#options' => array( + 'HEAD' => t('HEAD'), + 'GET' => t('GET'), + ), + '#description' => t('Select the request method used for link checks of this link. 
If you encounter issues like status code 500 errors with the HEAD request method you should try the GET request method before ignoring a link.'), + ); + + $form['settings']['status'] = array( + '#default_value' => $link['status'], + '#type' => 'checkbox', + '#title' => t('Check link status'), + '#description' => t("Uncheck if you wish to ignore this link. Use this setting only as a last resort if there is no other way to solve a failed link check."), + ); + + $form['maintenance'] = array( + '#type' => 'fieldset', + '#title' => t('Maintenance'), + '#collapsible' => FALSE, + ); + + $form['maintenance']['recheck'] = array( + '#default_value' => 0, + '#type' => 'checkbox', + '#title' => t('Re-check link status on next cron run'), + '#description' => t('Enable this checkbox if you want to re-check the link during the next cron job rather than wait for the next scheduled check on @date.', array('@date' => format_date($link['last_checked'] + variable_get('linkchecker_check_links_interval', 2419200)))), + ); + + $form['buttons']['submit'] = array('#type' => 'submit', '#value' => t('Save configuration')); + $form['buttons']['reset'] = array('#type' => 'submit', '#value' => t('Reset to defaults')); + + return $form; +} + +function linkchecker_link_edit_form_submit($form, &$form_state) { + // Force asap link re-check. + if ($form_state['values']['recheck']) { + db_query("UPDATE {linkchecker_links} SET last_checked = %d WHERE lid = %d", 0, $form_state['values']['lid']); + drupal_set_message(t('The link %url will be checked again on the next cron run.', array('%url' => $form_state['values']['url']))); + } + + if ($form_state['values']['method'] != $form['settings']['method']['#default_value']) { + // Update settings and reset statistics for a quick re-check. 
+ db_query("UPDATE {linkchecker_links} SET method = '%s', fail_count = %d, last_checked = %d, status = %d WHERE lid = %d", $form_state['values']['method'], 0, 0, $form_state['values']['status'], $form_state['values']['lid']); + drupal_set_message(t('The link settings for %url have been saved and the fail counter has been reset.', array('%url' => $form_state['values']['url']))); + } + else { + // Update setting only. + db_query("UPDATE {linkchecker_links} SET method = '%s', status = %d WHERE lid = %d", $form_state['values']['method'], $form_state['values']['status'], $form_state['values']['lid']); + drupal_set_message(t('The link settings for %url have been saved.', array('%url' => $form_state['values']['url']))); + } +} + +function _linkchecker_is_internal_url(&$link) { + global $base_url; + + if (strpos($link->url, $base_url) === 0) { + $link->internal = trim(substr($link->url, strlen($base_url)), " \t\r\n\0\\/"); + return TRUE; + } +} diff --git a/sites/all/modules/linkchecker/linkchecker.info b/sites/all/modules/linkchecker/linkchecker.info new file mode 100644 index 0000000..909e60c --- /dev/null +++ b/sites/all/modules/linkchecker/linkchecker.info @@ -0,0 +1,10 @@ +name = Link checker +description = "Periodically checks for broken links in node types, blocks and cck fields and reports the results." +core = 6.x + +; Information added by Drupal.org packaging script on 2014-06-07 +version = "6.x-2.8" +core = "6.x" +project = "linkchecker" +datestamp = "1402132729" + diff --git a/sites/all/modules/linkchecker/linkchecker.install b/sites/all/modules/linkchecker/linkchecker.install new file mode 100644 index 0000000..d603c98 --- /dev/null +++ b/sites/all/modules/linkchecker/linkchecker.install @@ -0,0 +1,670 @@ +name); +} + +/** + * Implementation of hook_uninstall(). 
+ */
+function linkchecker_uninstall() {
+  // Remove the tables this module installed.
+  drupal_uninstall_schema('linkchecker');
+
+  // Names of the persistent variables this module deletes on uninstall,
+  // processed in the listed order.
+  $module_variables = array(
+    'linkchecker_action_status_code_301',
+    'linkchecker_action_status_code_404',
+    'linkchecker_check_connections_max',
+    'linkchecker_check_library',
+    'linkchecker_check_links_interval',
+    'linkchecker_check_useragent',
+    'linkchecker_cleanup_links_last',
+    'linkchecker_disable_link_check_for_urls',
+    'linkchecker_extract_from_a',
+    'linkchecker_extract_from_audio',
+    'linkchecker_extract_from_embed',
+    'linkchecker_extract_from_iframe',
+    'linkchecker_extract_from_img',
+    'linkchecker_extract_from_object',
+    'linkchecker_extract_from_source',
+    'linkchecker_extract_from_video',
+    'linkchecker_filter_blacklist',
+    'linkchecker_fqdn_only',
+    'linkchecker_ignore_response_codes',
+    'linkchecker_impersonate_user',
+    'linkchecker_scan_blocks',
+    'linkchecker_scan_comments',
+    'linkchecker_scan_nodetypes',
+  );
+  foreach ($module_variables as $variable_name) {
+    variable_del($variable_name);
+  }
+}
+
+/**
+ * Implementation of hook_schema().
+ */
+function linkchecker_schema() {
+
+  // One row per distinct URL, together with the result of its last check.
+  $schema['linkchecker_links'] = array(
+    'description' => 'Stores all links.',
+    'fields' => array(
+      'lid' => array(
+        'type' => 'serial',
+        'not null' => TRUE,
+        'description' => 'Primary Key: Unique link ID.',
+      ),
+      'urlhash' => array(
+        'type' => 'varchar',
+        'length' => 32,
+        'not null' => TRUE,
+        'description' => 'The indexable md5 hash of the {linkchecker_links}.url.',
+      ),
+      'url' => array(
+        'type' => 'text',
+        'not null' => TRUE,
+        'description' => 'The full qualified link.',
+      ),
+      'method' => array(
+        'type' => 'varchar',
+        'length' => 4,
+        'default' => 'HEAD',
+        'not null' => TRUE,
+        'description' => 'The method for checking links (HEAD, GET, POST).',
+      ),
+      'code' => array(
+        'type' => 'int',
+        'not null' => TRUE,
+        'default' => -1,
+        'description' => 'HTTP status code from link checking.',
+      ),
+      'error' => array(
+        'type' => 'text',
+        'not null' => FALSE,
+        'description' => 'The error message received from the remote server while doing link checking.',
+      ),
+      'fail_count' => array(
+        'type' => 'int',
+        'not null' => TRUE,
+        'default' => 0,
+        'description' => 'Fail count of unsuccessful link checks. No flapping detection. (Successful = 0, Unsuccessful = fail_count+1).',
+      ),
+      'last_checked' => array(
+        'type' => 'int',
+        'not null' => TRUE,
+        'default' => 0,
+        'description' => 'Timestamp of the last link check.',
+      ),
+      'status' => array(
+        'type' => 'int',
+        'not null' => TRUE,
+        'default' => 1,
+        'description' => 'Boolean indicating if a link should be checked or not.',
+      ),
+    ),
+    'primary key' => array('lid'),
+    'unique keys' => array('urlhash' => array('urlhash')),
+    'indexes' => array(
+      'method' => array('method'),
+      'code' => array('code'),
+      'fail_count' => array('fail_count'),
+      'last_checked' => array('last_checked'),
+      'status' => array('status'),
+    ),
+  );
+
+  // Reference tables: which box / node / comment contains which link.
+  $schema['linkchecker_boxes'] = array(
+    'description' => 'Stores all link references for boxes.',
+    'fields' => array(
+      'bid' => array(
+        'type' => 'int',
+        'not null' => TRUE,
+        'description' => 'Primary Key: Unique {boxes}.bid.',
+      ),
+      'lid' => array(
+        'type' => 'int',
+        'not null' => TRUE,
+        'description' => 'Primary Key: Unique {linkchecker_links}.lid.',
+      ),
+    ),
+    'primary key' => array('bid', 'lid'),
+    'indexes' => array('lid' => array('lid')),
+  );
+
+  $schema['linkchecker_nodes'] = array(
+    'description' => 'Stores all link references for nodes.',
+    'fields' => array(
+      'nid' => array(
+        'type' => 'int',
+        'not null' => TRUE,
+        'description' => 'Primary Key: Unique {node}.nid.',
+      ),
+      'lid' => array(
+        'type' => 'int',
+        'not null' => TRUE,
+        'description' => 'Primary Key: Unique {linkchecker_links}.lid.',
+      ),
+    ),
+    'primary key' => array('nid', 'lid'),
+    'indexes' => array('lid' => array('lid')),
+  );
+
+  $schema['linkchecker_comments'] = array(
+    'description' => 'Stores all link references for comments.',
+    'fields' => array(
+      'cid' => array(
+        'type' => 'int',
+        'not null' => TRUE,
+        'description' => 'Primary Key: Unique {comments}.cid.',
+      ),
+      'lid' => array(
+        'type' => 'int',
+        'not null' => TRUE,
+        'description' => 'Primary Key: Unique {linkchecker_links}.lid.',
+      ),
+    ),
+    'primary key' => array('cid', 'lid'),
+    'indexes' => array('lid' => array('lid')),
+  );
+
+  return $schema;
+}
+
+/**
+ * Implementation of hook_requirements().
+ *
+ * Only the 'install' phase is handled: on MySQL/MySQLi the server version
+ * must be at least LINKCHECKER_MINIMUM_MYSQL.
+ *
+ * @param $phase
+ *   The requirements phase being checked.
+ *
+ * @return
+ *   An array of requirement entries; empty if nothing is wrong.
+ */
+function linkchecker_requirements($phase) {
+  $requirements = array();
+  $t = get_t();
+
+  switch ($phase) {
+    case 'install' :
+      // MySQL version with subselect support is required.
+      $version = db_version();
+      if (in_array($GLOBALS['db_type'], array('mysql', 'mysqli')) && version_compare($version, LINKCHECKER_MINIMUM_MYSQL) < 0) {
+        $requirements['linkchecker_minimum_mysql'] = array(
+          'title' => $t('MySQL database'),
+          'value' => $version,
+          'severity' => REQUIREMENT_ERROR,
+          'description' => $t('Your MySQL Server is too old. Link Checker requires at least MySQL %version.', array('%version' => LINKCHECKER_MINIMUM_MYSQL)),
+        );
+      }
+      break;
+  }
+
+  return $requirements;
+}
+
+/**
+ * Upgrade module to new D5 schema.
+ *
+ * Drops the old task/result tables, creates the links, boxes and nodes
+ * tables, migrates the ignored response codes and deletes obsolete variables.
+ */
+function linkchecker_update_5200() {
+  $ret = array();
+
+  // Module functions are required. Make sure the module is loaded.
+  drupal_load('module', 'linkchecker');
+
+  // Remove obsolete tables no longer required.
+  db_drop_table($ret, 'linkchecker_tasks');
+  db_drop_table($ret, 'linkchecker_results');
+
+  // Create new tables.
+  $schema['linkchecker_links'] = array(
+    'description' => 'Stores all links.',
+    'fields' => array(
+      'lid' => array(
+        'type' => 'int',
+        'not null' => TRUE,
+        'description' => 'Primary Key: Unique link ID.',
+      ),
+      'token' => array(
+        'type' => 'varchar',
+        'length' => 32,
+        'not null' => TRUE,
+        'description' => 'The indexable md5 hash of the {linkchecker_links}.url.',
+      ),
+      'url' => array(
+        'type' => 'text',
+        'not null' => TRUE,
+        'description' => 'The full qualified link URL.',
+      ),
+      'method' => array(
+        'type' => 'varchar',
+        'length' => 4,
+        'default' => 'HEAD',
+        'not null' => TRUE,
+        'description' => 'The method for checking links (HEAD, GET, POST).',
+      ),
+      'code' => array(
+        'type' => 'int',
+        'not null' => TRUE,
+        'default' => 0,
+        'description' => 'HTTP status code from link checking.',
+      ),
+      'error' => array(
+        'type' => 'text',
+        'not null' => FALSE,
+        'description' => 'The error message received from the remote server while doing link checking.',
+      ),
+      'fail_count' => array(
+        'type' => 'int',
+        'not null' => TRUE,
+        'default' => 0,
+        'description' => 'Fail count of unsuccessful link checks. No flapping detection. (Successful = 0, Unsuccessful = fail_count+1).',
+      ),
+      'last_checked' => array(
+        'type' => 'int',
+        'not null' => TRUE,
+        'default' => 0,
+        'description' => 'Timestamp of the last link check.',
+      ),
+    ),
+    'primary key' => array('lid'),
+    'unique keys' => array('token' => array('token')),
+  );
+
+  $schema['linkchecker_boxes'] = array(
+    'description' => 'Stores all link references for boxes.',
+    'fields' => array(
+      'bid' => array(
+        'type' => 'int',
+        'not null' => TRUE,
+        'description' => 'Primary Key: Unique {boxes}.bid.',
+      ),
+      'lid' => array(
+        'type' => 'int',
+        'not null' => TRUE,
+        'description' => 'Primary Key: Unique {linkchecker_links}.lid.',
+      ),
+    ),
+    'primary key' => array('bid', 'lid'),
+  );
+
+  $schema['linkchecker_nodes'] = array(
+    'description' => 'Stores all link references for nodes.',
+    'fields' => array(
+      'nid' => array(
+        'type' => 'int',
+        'not null' => TRUE,
+        'description' => 'Primary Key: Unique {node}.nid.',
+      ),
+      'lid' => array(
+        'type' => 'int',
+        'not null' => TRUE,
+        'description' => 'Primary Key: Unique {linkchecker_links}.lid.',
+      ),
+    ),
+    'primary key' => array('nid', 'lid'),
+  );
+
+  // Create schema.
+  db_create_table($ret, 'linkchecker_links', $schema['linkchecker_links']);
+  db_create_table($ret, 'linkchecker_boxes', $schema['linkchecker_boxes']);
+  db_create_table($ret, 'linkchecker_nodes', $schema['linkchecker_nodes']);
+
+  // Upgrade settings. Could be less code, but is easier to follow.
+  $ignore_response_codes = preg_split('/(\r\n?|\n)/', variable_get('linkchecker_ignore_responses', "200\n304\n401\n403"));
+
+  // Filter all invalid response codes and outdated error messages out.
+  $ignore_response_codes = array_filter($ignore_response_codes, '_linkchecker_isvalid_response_code');
+
+  // Make sure we have status code 200 and 304 in the ignore list.
+  $ignore_response_codes = array_merge(array('200', '304'), $ignore_response_codes);
+  $ignore_response_codes = array_unique($ignore_response_codes);
+
+  variable_set('linkchecker_ignore_response_codes', implode("\n", $ignore_response_codes));
+  $ret[] = array('success' => TRUE, 'query' => 'Ignored response codes have been upgraded to '. implode(",", $ignore_response_codes));
+
+  // Remove obsolete settings.
+  variable_del('linkchecker_ignore_responses');
+  variable_del('linkchecker_rebuild');
+  variable_del('linkchecker_maxtime');
+  variable_del('linkchecker_socket_timeout');
+  variable_del('linkchecker_max_links_per_node');
+  variable_del('linkchecker_remove_after');
+  variable_del('linkchecker_give_up');
+
+  return $ret;
+}
+
+/**
+ * Upgrade module to new D6 schema.
+ *
+ * Turns {linkchecker_links}.lid into a serial column and moves the line break
+ * converter entry of the filter blacklist from filter/2 to filter/1.
+ */
+function linkchecker_update_6200() {
+  $ret = array();
+
+  db_change_field($ret, 'linkchecker_links', 'lid', 'lid', array('type' => 'serial', 'not null' => TRUE));
+
+  // Upgrade line break converter from D5 filter/2 to D6 filter/1.
+  $linkchecker_filter_blacklist = variable_get('linkchecker_filter_blacklist', array());
+  // Test the key itself: a stored blacklist without a 'filter/2' entry must
+  // not raise an undefined index notice.
+  if (is_array($linkchecker_filter_blacklist) && !empty($linkchecker_filter_blacklist['filter/2'])) {
+    $linkchecker_filter_blacklist['filter/2'] = 0;
+    $linkchecker_filter_blacklist['filter/1'] = 'filter/1';
+    variable_set('linkchecker_filter_blacklist', $linkchecker_filter_blacklist);
+    $ret[] = array('success' => TRUE, 'query' => 'Upgraded the default exclusion of line break converter filter from D5 filter/2 to D6 filter/1.');
+  }
+
+  return $ret;
+}
+
+/**
+ * Install linkchecker_comments table.
+ */
+function linkchecker_update_6201() {
+  $ret = array();
+
+  // Create new tables.
+  $schema['linkchecker_comments'] = array(
+    'description' => 'Stores all link references for comments.',
+    'fields' => array(
+      'cid' => array(
+        'type' => 'int',
+        'not null' => TRUE,
+        'description' => 'Primary Key: Unique {comments}.cid.',
+      ),
+      'lid' => array(
+        'type' => 'int',
+        'not null' => TRUE,
+        'description' => 'Primary Key: Unique {linkchecker_links}.lid.',
+      ),
+    ),
+    'primary key' => array('cid', 'lid'),
+  );
+
+  // Table may be created in 5201 by 5.x-2.x.
+  if (!db_table_exists('linkchecker_comments')) {
+    db_create_table($ret, 'linkchecker_comments', $schema['linkchecker_comments']);
+  }
+
+  return $ret;
+}
+
+/**
+ * Remove unnecessary elements from custom submit buttons.
+ */
+function linkchecker_update_6202() {
+  $ret = array();
+
+  variable_del('linkchecker_analyze');
+  variable_del('linkchecker_clear_analyze');
+
+  return $ret;
+}
+
+/**
+ * Make fields unsigned.
+ *
+ * For the three reference tables the composite primary key is dropped before
+ * the columns are changed and re-created afterwards.
+ */
+function linkchecker_update_6204() {
+  $ret = array();
+
+  // Make linkchecker_links.lid unsigned.
+  db_change_field($ret, 'linkchecker_links', 'lid', 'lid', array('type' => 'serial', 'unsigned' => TRUE, 'not null' => TRUE));
+  if ($GLOBALS['db_type'] == 'pgsql') {
+    db_add_primary_key($ret, 'linkchecker_links', array('lid'));
+  }
+
+  // Make linkchecker_boxes.bid and linkchecker_boxes.lid unsigned.
+  db_drop_primary_key($ret, 'linkchecker_boxes');
+  db_change_field($ret, 'linkchecker_boxes', 'bid', 'bid', array('type' => 'int', 'unsigned' => TRUE, 'not null' => TRUE));
+  db_change_field($ret, 'linkchecker_boxes', 'lid', 'lid', array('type' => 'int', 'unsigned' => TRUE, 'not null' => TRUE));
+  db_add_primary_key($ret, 'linkchecker_boxes', array('bid', 'lid'));
+
+  // Make linkchecker_comments.cid and linkchecker_comments.lid unsigned.
+  db_drop_primary_key($ret, 'linkchecker_comments');
+  db_change_field($ret, 'linkchecker_comments', 'cid', 'cid', array('type' => 'int', 'unsigned' => TRUE, 'not null' => TRUE));
+  db_change_field($ret, 'linkchecker_comments', 'lid', 'lid', array('type' => 'int', 'unsigned' => TRUE, 'not null' => TRUE));
+  db_add_primary_key($ret, 'linkchecker_comments', array('cid', 'lid'));
+
+  // Make linkchecker_nodes.nid and linkchecker_nodes.lid unsigned.
+  db_drop_primary_key($ret, 'linkchecker_nodes');
+  db_change_field($ret, 'linkchecker_nodes', 'nid', 'nid', array('type' => 'int', 'unsigned' => TRUE, 'not null' => TRUE));
+  db_change_field($ret, 'linkchecker_nodes', 'lid', 'lid', array('type' => 'int', 'unsigned' => TRUE, 'not null' => TRUE));
+  db_add_primary_key($ret, 'linkchecker_nodes', array('nid', 'lid'));
+
+  return $ret;
+}
+
+/**
+ * Remove any references to nodes that are not published and also remove their comments references.
+ */
+function linkchecker_update_6205() {
+  $ret = array();
+
+  $ret[] = update_sql("DELETE FROM {linkchecker_nodes} WHERE nid IN (SELECT nid FROM {node} WHERE status = 0)");
+  $ret[] = update_sql("DELETE FROM {linkchecker_comments} WHERE cid IN (SELECT c.cid FROM {node} n INNER JOIN {comments} c ON c.nid = n.nid WHERE n.status = 0)");
+
+  return $ret;
+}
+
+/**
+ * Add status column to links table.
+ */
+function linkchecker_update_6206() {
+  $ret = array();
+
+  // Add the column only if it does not exist yet. The column may have been
+  // added in D5 (5207).
+  if (!db_column_exists('linkchecker_links', 'status')) {
+    db_add_field($ret, 'linkchecker_links', 'status', array('type' => 'int', 'not null' => TRUE, 'default' => 1));
+  }
+
+  return $ret;
+}
+
+/**
+ * Remove obsolete variable.
+ */
+function linkchecker_update_6207() {
+  $ret = array();
+
+  variable_del('linkchecker_cleanup_links_interval');
+
+  return $ret;
+}
+
+/**
+ * Change default value for linkchecker_links.code to -1.
+ *
+ * fsockopen may return 0 as an indication that the error occurred before
+ * the connect() call. This is most likely due to a problem initializing the
+ * socket.
+ */
+function linkchecker_update_6208() {
+  $ret = array();
+
+  // Same column as before; only the default value moves to -1.
+  $code_column = array(
+    'type' => 'int',
+    'not null' => TRUE,
+    'default' => -1,
+  );
+  db_change_field($ret, 'linkchecker_links', 'code', 'code', $code_column);
+
+  return $ret;
+}
+
+/**
+ * Rename the 'access linkchecker' permission in the {permission} table.
+ *
+ * Every role row is rewritten so that 'access linkchecker' becomes
+ * 'access broken links report', matching the names returned by hook_perm().
+ */
+function linkchecker_update_6209() {
+  $ret = array();
+
+  // First pass: read all roles and compute the renamed permission strings.
+  $renamed_by_role = array();
+  $roles = db_query('SELECT rid, perm FROM {permission}');
+  while ($role = db_fetch_object($roles)) {
+    $renamed_by_role[$role->rid] = preg_replace('/access linkchecker/', 'access broken links report', $role->perm);
+  }
+
+  // Second pass: write every row back and report the statement.
+  foreach ($renamed_by_role as $role_id => $permission_list) {
+    db_query("UPDATE {permission} SET perm = '%s' WHERE rid = %d", $permission_list, $role_id);
+    $ret[] = array(
+      'success' => TRUE,
+      'query' => 'UPDATE {permission} SET perm = ' . check_plain($permission_list) . ' WHERE rid = ' . $role_id,
+    );
+  }
+
+  return $ret;
+}
+
+/**
+ * Ensure status code 302 is on the list of ignored response codes.
+ */
+function linkchecker_update_6210() {
+  $ret = array();
+
+  $codes = preg_split('/(\r\n?|\n)/', variable_get('linkchecker_ignore_response_codes', "200\n302\n304\n401\n403"));
+
+  // Nothing to store when the code is already listed.
+  if (in_array('302', $codes)) {
+    $ret[] = array('success' => TRUE, 'query' => 'No action taken. The status code 302 was already on the list of ignored response codes.');
+    return $ret;
+  }
+
+  // Append the code, keep the list sorted and save it one code per line.
+  $codes[] = '302';
+  sort($codes);
+  variable_set('linkchecker_ignore_response_codes', implode("\n", $codes));
+  $ret[] = array('success' => TRUE, 'query' => 'Added the status code 302 to the list of ignored response codes.');
+
+  return $ret;
+}
+
+/**
+ * Remove obsolete variable.
+ */
+function linkchecker_update_6211() {
+  variable_del('linkchecker_pathfilter_support');
+
+  $ret = array();
+  $ret[] = array('success' => TRUE, 'query' => 'Removed obsolete linkchecker_pathfilter_support variable.');
+
+  return $ret;
+}
+
+/**
+ * Rename column 'token' and its unique key to 'urlhash'.
+ */
+function linkchecker_update_6212() {
+  $ret = array();
+
+  // The renamed column keeps the definition used for the hash column.
+  $hash_column = array(
+    'type' => 'varchar',
+    'length' => 32,
+    'not null' => TRUE,
+  );
+
+  // The unique key is dropped before the rename and re-added afterwards.
+  db_drop_unique_key($ret, 'linkchecker_links', 'token');
+  db_change_field($ret, 'linkchecker_links', 'token', 'urlhash', $hash_column);
+  db_add_unique_key($ret, 'linkchecker_links', 'urlhash', array('urlhash'));
+
+  return $ret;
+}
+
+/**
+ * Add RFC documentation domains back if they have been removed.
+ */
+function linkchecker_update_6213() {
+  $ret = array();
+
+  // Currently configured URLs, one per line, with empty lines dropped.
+  $stored_value = variable_get('linkchecker_disable_link_check_for_urls', LINKCHECKER_RESERVED_DOCUMENTATION_DOMAINS);
+  $configured_urls = array_filter(preg_split('/(\r\n?|\n)/', $stored_value));
+
+  // Reserved domains come first, followed by the configured URLs; duplicates
+  // are removed before the list is stored again.
+  $reserved_domains = explode("\n", LINKCHECKER_RESERVED_DOCUMENTATION_DOMAINS);
+  $merged_urls = array_unique(array_merge($reserved_domains, $configured_urls));
+  variable_set('linkchecker_disable_link_check_for_urls', implode("\n", $merged_urls));
+
+  $ret[] = array('success' => TRUE, 'query' => 'Added RFC documenation domains back if they have been removed.');
+
+  return $ret;
+}
+
+/**
+ * #1321378: Improve performance of queries.
+ */
+function linkchecker_update_6214() {
+  $ret = array();
+
+  // Index the lid column of every reference table.
+  $reference_tables = array('linkchecker_boxes', 'linkchecker_comments', 'linkchecker_nodes');
+  foreach ($reference_tables as $table) {
+    db_add_index($ret, $table, 'lid', array('lid'));
+  }
+
+  return $ret;
+}
+
+/**
+* Add status code 206 to the default list of ignored response codes.
+*/
+function linkchecker_update_6215() {
+  $ret = array();
+
+  $codes = preg_split('/(\r\n?|\n)/', variable_get('linkchecker_ignore_response_codes', "200\n206\n302\n304\n401\n403"));
+
+  // Nothing to store when the code is already listed.
+  if (in_array('206', $codes)) {
+    $ret[] = array('success' => TRUE, 'query' => 'No action taken. The status code 206 was already on the list of ignored response codes.');
+    return $ret;
+  }
+
+  // Append the code, keep the list sorted and save it one code per line.
+  $codes[] = '206';
+  sort($codes);
+  variable_set('linkchecker_ignore_response_codes', implode("\n", $codes));
+  $ret[] = array('success' => TRUE, 'query' => 'Added the status code 206 to the list of ignored response codes.');
+
+  return $ret;
+}
+
+/**
+ * #965720: Add indexes to improve performance of views queries.
+ */
+function linkchecker_update_6216() {
+  $ret = array();
+
+  // Each listed column gets a single-column index named after the column.
+  $indexed_columns = array('method', 'code', 'fail_count', 'last_checked', 'status');
+  foreach ($indexed_columns as $column) {
+    db_add_index($ret, 'linkchecker_links', $column, array($column));
+  }
+
+  return $ret;
+}
+
+/**
+ * #1450672: Set user 1 as default user to impersonate content updates.
+ */
+function linkchecker_update_6217() {
+  $ret = array();
+
+  // The setting stores the account name, not the uid.
+  $first_user = user_load(1);
+  variable_set('linkchecker_impersonate_user', $first_user->name);
+
+  $ret[] = array('success' => TRUE, 'query' => 'Set user 1 as default user to impersonate content updates.');
+
+  return $ret;
+}
+
+/**
+ * Removed obsolete linkchecker_check_links_max variable.
+ */
+function linkchecker_update_6218() {
+  variable_del('linkchecker_check_links_max');
+
+  $ret = array();
+  $ret[] = array('success' => TRUE, 'query' => 'Removed obsolete linkchecker_check_links_max variable.');
+
+  return $ret;
+}
+
+/**
+ * Upgrade outdated HTTP user agents.
+ */ +function linkchecker_update_6219() { + $ret = array(); + + $linkchecker_check_useragent = variable_get('linkchecker_check_useragent', 'Drupal (+http://drupal.org/)'); + + $useragent_upgrade = array( + 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1;)' => 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)', + 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0;)' => 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; WOW64; Trident/6.0)', + 'Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.9.0.5) Gecko/2008120122 Firefox/3.0.5' => 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:17.0) Gecko/20100101 Firefox/17.0', + 'Mozilla/5.0 (Windows; U; Windows NT 6.0; de; rv:1.9.0.5) Gecko/2008120122 Firefox/3.0.5' => 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:17.0) Gecko/20100101 Firefox/17.0', + ); + + if (isset($useragent_upgrade[$linkchecker_check_useragent])) { + variable_set('linkchecker_check_useragent', $useragent_upgrade[$linkchecker_check_useragent]); + $ret[] = array('success' => TRUE, 'query' => 'Upgraded outdated HTTP user agent.'); + } + else { + $ret[] = array('success' => TRUE, 'query' => 'User agent already up to date.'); + } + + return $ret; +} diff --git a/sites/all/modules/linkchecker/linkchecker.module b/sites/all/modules/linkchecker/linkchecker.module new file mode 100644 index 0000000..7efb2c7 --- /dev/null +++ b/sites/all/modules/linkchecker/linkchecker.module @@ -0,0 +1,1959 @@ + ] + * - Insert view filter, http://drupal.org/project/insert_view + * name: insert_view/0 + * tags: [view:my_view] + * - Smileys filter, http://drupal.org/project/smiley + * name: smiley/0 + * tags: Depends on icon set, for e.g: ":) :-) :smile:" + * - Smileys Filter, http://drupal.org/project/smileys + * name: smileys/0 + * tags: Depends on icon set, for e.g: ":) :-) :smile:" + * - Weblink filter, http://drupal.org/project/links + * name: links_weblink/0 + * tags: [weblink:node_id|text], [weblink:node_id/link_id], [weblink:http://weblink.example.com/] + * - 
Web Links Embed, http://drupal.org/project/weblinks
+ *   name: weblinks_embed/0
+ *   tags: [links-embed: id], [links-embed: name]
+ * - Web Links Filter, http://drupal.org/project/weblinks
+ *   name: weblinks_filter/0
+ *   tags: [link: title]
+ */
+define('LINKCHECKER_DEFAULT_FILTER_BLACKLIST', 'filter/1|insert_block/0|insert_node/0|insert_view/0|smiley/0|smileys/0|links_weblink/0|weblinks_embed/0|weblinks_filter/0');
+
+/**
+ * Implementation of hook_perm().
+ *
+ * @return
+ *   The list of permission names defined by this module.
+ */
+function linkchecker_perm() {
+  return array('access broken links report', 'access own broken links report', 'administer linkchecker', 'edit link settings');
+}
+
+/**
+ * Implementation of hook_help().
+ *
+ * @param $path
+ *   The path help is requested for; only 'admin/help#linkchecker' is handled.
+ * @param $arg
+ *   Unused.
+ *
+ * @return
+ *   The help text as an HTML paragraph, or nothing for any other path.
+ */
+function linkchecker_help($path, $arg) {
+  switch ($path) {
+    case 'admin/help#linkchecker':
+      // The @rfc placeholder supplies the link target, so the translatable
+      // string has to contain the anchor that uses it.
+      return '<p>' . t('This module provides an aid to finding broken links on your site. It periodically checks contents of all public nodes, tries to find any html links and check for their validity. It reports broken links through the admin interface. For more information about status codes see <a href="@rfc">Status Code Definitions</a>.', array('@rfc' => 'http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html')) . '</p>';
+  }
+}
+
+/**
+ * Implementation of hook_menu().
+ *
+ * Registers the settings form, the site-wide and per-user broken links
+ * reports and the edit form for a single link.
+ */
+function linkchecker_menu() {
+  $items = array();
+
+  $items['admin/settings/linkchecker'] = array(
+    'title' => 'Link checker',
+    'description' => 'Configure the content types that should be checked for broken links and how the hypertext links will be checked and reported and repaired.',
+    'page callback' => 'drupal_get_form',
+    'page arguments' => array('linkchecker_admin_settings_form'),
+    'access arguments' => array('administer linkchecker'),
+    'file' => 'includes/linkchecker.admin.inc',
+  );
+  $items['admin/reports/linkchecker'] = array(
+    'title' => 'Broken links',
+    'description' => 'Shows a list of broken links in content.',
+    'page callback' => 'linkchecker_admin_report_page',
+    'type' => MENU_NORMAL_ITEM,
+    'access arguments' => array('access broken links report'),
+    'file' => 'includes/linkchecker.pages.inc',
+  );
+  // Add the user menu item after node/edit tab.
+  $items['user/%user/linkchecker'] = array(
+    'title' => 'Broken links',
+    'description' => 'Shows a list of broken links in content.',
+    'page callback' => 'linkchecker_user_report_page',
+    'page arguments' => array(1),
+    'type' => MENU_LOCAL_TASK,
+    'access callback' => '_linkchecker_user_access_account_broken_links_report',
+    'access arguments' => array(1),
+    'file' => 'includes/linkchecker.pages.inc',
+    'weight' => 3,
+  );
+  $items['linkchecker/%linkchecker_link/edit'] = array(
+    'title' => 'Edit link settings',
+    'page callback' => 'drupal_get_form',
+    'page arguments' => array('linkchecker_link_edit_form', 1),
+    'access callback' => '_linkchecker_user_access_edit_link_settings',
+    'access arguments' => array(1),
+    'file' => 'includes/linkchecker.pages.inc',
+    'type' => MENU_CALLBACK,
+  );
+
+  return $items;
+}
+
+/**
+ * Access callback for user/%user/linkchecker.
+ */
+function _linkchecker_user_access_account_broken_links_report($account) {
+  global $user;
+
+  // Broken link reports only exist for authenticated accounts.
+  if (!$account->uid) {
+    return FALSE;
+  }
+
+  // Holders of 'access own broken links report' may only open the report that
+  // belongs to their own account.
+  if ($user->uid == $account->uid && user_access('access own broken links report')) {
+    return TRUE;
+  }
+
+  // Everybody else needs 'access broken links report', which opens the report
+  // of any authenticated user.
+  return (bool) user_access('access broken links report');
+}
+
+/**
+ * Access callback for linkchecker/%linkchecker_link/edit.
+ *
+ * @param $link
+ *   The link whose settings are about to be edited.
+ * @return
+ *   TRUE if the current user has the 'edit link settings' permission and is
+ *   allowed to view the link, FALSE otherwise.
+ */
+function _linkchecker_user_access_edit_link_settings($link) {
+  if (!user_access('edit link settings')) {
+    return FALSE;
+  }
+  return (bool) _linkchecker_link_access($link);
+}
+
+/**
+ * Determines if the current user has access to view a link.
+ *
+ * Link URLs can contain private information (for example, usernames and
+ * passwords). So this module should only display links to a user if the link
+ * already appears in at least one place on the site where the user would
+ * otherwise have access to see it.
+ *
+ * @param $link
+ *   The link to check, as an object or an array; it is cast to an object.
+ * @return
+ *   TRUE as soon as one node, comment or block lookup (tried in that order)
+ *   returns a non-empty list, FALSE otherwise.
+ */
+function _linkchecker_link_access($link) {
+  $link_object = (object) $link;
+
+  if (_linkchecker_link_node_ids($link_object)) {
+    return TRUE;
+  }
+  if (_linkchecker_link_comment_ids($link_object)) {
+    return TRUE;
+  }
+  return (bool) _linkchecker_link_block_ids($link_object);
+}
+
+/**
+ * Returns IDs of nodes that contain a link which the current user may be allowed to view.
+ *
+ * Important note: For performance reasons, this function is not always
+ * guaranteed to return the exact list of node IDs that the current user is
+ * allowed to view. It will, however, always return an empty array if the user
+ * does not have access to view *any* such nodes, thereby meeting the security
+ * goals of _linkchecker_link_access() and other places that call it.
+ *
+ * In the case where a user has access to some of the nodes that contain the
+ * link, this function may return some node IDs that the user does not have
+ * access to. Therefore, use caution with its results.
+ *
+ * @param $link
+ *   An object representing the link to check.
+ * @param $node_author_account
+ *   (optional) If a user account object is provided, the returned nodes will
+ *   additionally be restricted to only those owned by this account. Otherwise,
+ *   nodes owned by any user account may be returned.
+ * @return
+ *   An array of node IDs that contain the provided link and that the current
+ *   user may be allowed to view.
+ */
+function _linkchecker_link_node_ids($link, $node_author_account = NULL) {
+  // Static cache of the links extracted from each node, keyed by node ID.
+  static $fields_with_node_links = array();
+
+  // Nothing can be returned when no node type is scanned or when the current
+  // user cannot access content at all.
+  $enabled_node_types = array_filter(variable_get('linkchecker_scan_nodetypes', array()));
+  if (empty($enabled_node_types) || !user_access('access content')) {
+    return array();
+  }
+
+  // Assemble the query and its arguments. When an author account is given,
+  // only nodes or current revisions owned by that account are selected.
+  $arguments = array($link->lid);
+  if (empty($node_author_account)) {
+    $sql = 'SELECT n.nid
+      FROM {node} n
+      INNER JOIN {linkchecker_nodes} ln ON ln.nid = n.nid
+      WHERE ln.lid = %d';
+  }
+  else {
+    $sql = 'SELECT n.nid
+      FROM {node} n
+      INNER JOIN {linkchecker_nodes} ln ON ln.nid = n.nid
+      INNER JOIN {node_revisions} r ON r.vid = n.vid
+      WHERE ln.lid = %d AND (n.uid = %d OR r.uid = %d)';
+    $arguments[] = $node_author_account->uid;
+    $arguments[] = $node_author_account->uid;
+  }
+
+  // Language negotiation is switched off while the query is rewritten and
+  // executed, and reset right afterwards. db_rewrite_sql() lets node access
+  // modules exclude nodes the current user is not allowed to view.
+  $has_i18n = module_exists('i18n');
+  if ($has_i18n) {
+    i18n_selection_mode('off');
+  }
+  $result = db_query(db_rewrite_sql($sql), $arguments);
+  if ($has_i18n) {
+    i18n_selection_mode('reset');
+  }
+
+  // Look for the first node in which the current user may really see the
+  // link. For performance reasons the remainder of the result set is returned
+  // unchecked once such a node has been found.
+  $nids = array();
+  while ($row = db_fetch_object($result)) {
+    $node = node_load($row->nid);
+
+    // The link must still be part of the node; if it is not, it is not safe to
+    // return it, since we cannot know whether it carried access restrictions
+    // for the current user when it was originally extracted.
+    if (!isset($fields_with_node_links[$node->nid])) {
+      $fields_with_node_links[$node->nid] = _linkchecker_extract_node_links($node, TRUE);
+    }
+    if (empty($fields_with_node_links[$node->nid][$link->url])) {
+      continue;
+    }
+
+    // If the link only appears in CCK fields and a field access module is in
+    // use, the user must be able to view at least one field containing it.
+    $link_fields = $fields_with_node_links[$node->nid][$link->url];
+    if (!in_array('node', $link_fields) && module_exists('content') && module_implements('field_access')) {
+      $viewable_fields = array();
+      foreach (content_fields(NULL, $node->type) as $field) {
+        // Links are only extracted from link and text fields.
+        $is_scanned_type = ($field['type'] == 'link' || $field['type'] == 'text');
+        if ($is_scanned_type && content_access('view', $field, NULL, $node)) {
+          $viewable_fields[] = $field['field_name'];
+        }
+      }
+      if (!array_intersect($link_fields, $viewable_fields)) {
+        continue;
+      }
+    }
+
+    // Access is confirmed: keep this node and append every remaining row
+    // without further checks.
+    $nids[] = $node->nid;
+    while ($remaining = db_fetch_object($result)) {
+      $nids[] = $remaining->nid;
+    }
+    break;
+  }
+
+  return $nids;
+}
+
+/**
+ * Returns IDs of comments that contain a link which the current user is allowed to view.
+ *
+ * @param $link
+ *   An object representing the link to check.
+ * @param $comment_author_account
+ *   (optional) If a user account object is provided, the returned comments
+ *   will additionally be restricted to only those owned by this account.
+ *   Otherwise, comments owned by any user account may be returned.
+ * @return
+ *   An array of comment IDs that contain the provided link and that the
+ *   current user is allowed to view.
+ */
+function _linkchecker_link_comment_ids($link, $comment_author_account = NULL) {
+  // Comments are out of scope when the comment module is missing, comment
+  // scanning is switched off, or the user may not access comments.
+  if (!module_exists('comment') || !variable_get('linkchecker_scan_comments', 0) || !user_access('access comments')) {
+    return array();
+  }
+
+  // Assemble the query; an author account narrows it to that user's comments.
+  $arguments = array($link->lid);
+  if (empty($comment_author_account)) {
+    $sql = 'SELECT c.cid
+      FROM {comments} c
+      INNER JOIN {linkchecker_comments} lc ON lc.cid = c.cid
+      WHERE lc.lid = %d';
+  }
+  else {
+    $sql = 'SELECT c.cid
+      FROM {comments} c
+      INNER JOIN {linkchecker_comments} lc ON lc.cid = c.cid
+      WHERE lc.lid = %d AND c.uid = %d';
+    $arguments[] = $comment_author_account->uid;
+  }
+
+  // db_rewrite_sql() allows comment access modules to exclude comments that
+  // the current user does not have access to view.
+  $result = db_query(db_rewrite_sql($sql, 'c', 'cid'), $arguments);
+
+  $cids = array();
+  while ($row = db_fetch_object($result)) {
+    $cids[] = $row->cid;
+  }
+  return $cids;
+}
+
+/**
+ * Returns IDs of blocks that contain a link which the current user is allowed to view.
+ *
+ * @param $link
+ *   An object representing the link to check.
+ * @return
+ *   An array of custom block IDs that contain the provided link and that the
+ *   current user is allowed to view.
+ */
+function _linkchecker_link_block_ids($link) {
+  global $user;
+
+  // Exit if blocks are disabled.
+  if (!variable_get('linkchecker_scan_blocks', 0)) {
+    return array();
+  }
+
+  // Collect every custom block that references the link.
+  $bids = array();
+  $result = db_query('SELECT bid FROM {linkchecker_boxes} WHERE lid = %d', $link->lid);
+  while ($row = db_fetch_object($result)) {
+    $bids[] = $row->bid;
+  }
+
+  // If the user can administer blocks, they're able to see all block content.
+  if (user_access('administer blocks')) {
+    return $bids;
+  }
+
+  // Otherwise keep only the blocks visible to one of the user's roles, to the
+  // anonymous role, or to every role (no row in {blocks_roles}).
+  $rids = array_merge(array_keys($user->roles), array(DRUPAL_ANONYMOUS_RID));
+  $sql = "SELECT DISTINCT b.delta
+    FROM {blocks} b
+    LEFT JOIN {blocks_roles} r ON b.module = r.module AND b.delta = r.delta
+    WHERE b.module = 'block'
+    AND (r.rid IN (". db_placeholders($rids) .") OR r.rid IS NULL)";
+  $result = db_query($sql, $rids);
+
+  $allowed_bids = array();
+  while ($row = db_fetch_object($result)) {
+    $allowed_bids[] = $row->delta;
+  }
+  return array_intersect($bids, $allowed_bids);
+}
+
+/**
+ * Implementation of hook_cron().
+ *
+ * Cleans up unused links at most once a day, then starts the link checks,
+ * either as an HTTPRL background callback or inline in the cron process.
+ */
+function linkchecker_cron() {
+  // Remove outdated links no longer in use once per day.
+  $seconds_since_cleanup = time() - variable_get('linkchecker_cleanup_links_last', 0);
+  if ($seconds_since_cleanup >= 86400) {
+    _linkchecker_cleanup_links();
+    variable_set('linkchecker_cleanup_links_last', time());
+  }
+
+  $use_httprl = module_exists('httprl') && variable_get('linkchecker_check_library', 'core') == 'httprl';
+  if (!$use_httprl) {
+    // Run the link checks the normal way.
+    _linkchecker_check_links();
+    return;
+  }
+
+  // Run link checker in a new process, independent of cron: queue
+  // _linkchecker_check_links() as a background callback and send the request.
+  // It is deliberately not called in this process as well.
+  httprl_queue_background_callback(array(array('function' => '_linkchecker_check_links')));
+  httprl_send_request();
+}
+
+/**
+ * Run link checks.
+ *
+ * Checks every active link whose last check is older than the configured
+ * interval, limited to a number of links derived from max_execution_time.
+ *
+ * @return
+ *   FALSE if another process already holds the lock, TRUE once the run has
+ *   finished and the lock has been released.
+ */
+function _linkchecker_check_links() {
+  // Get max_execution_time from configuration, override 0 with 240 seconds.
+  $max_execution_time = ini_get('max_execution_time') == 0 ? 240 : ini_get('max_execution_time');
+  // Make sure we have enough time to validate all of the links.
+  linkchecker_set_time_limit($max_execution_time);
+
+  // Make sure this is the only process trying to run this function.
+  if (!lock_acquire(__FUNCTION__, $max_execution_time)) {
+    watchdog('linkchecker', 'Attempted to re-run link checks while they are already running.', array(), WATCHDOG_WARNING);
+    return FALSE;
+  }
+
+  $has_httprl = (module_exists('httprl') && variable_get('linkchecker_check_library', 'core') == 'httprl');
+
+  // Do not confuse admins with a setting of maximum checkable links per cron
+  // run and guess that 2 links can be checked per second with 1 thread, what is
+  // nevertheless uncommon. The max_execution_time can be used to calculate
+  // a useful value that is higher, but not totally out of scope and limits the
+  // query resultset to a reasonable size.
+  $linkchecker_check_connections_max = variable_get('linkchecker_check_connections_max', 8);
+  $check_links_max_per_cron_run = ($has_httprl) ? ($linkchecker_check_connections_max * $max_execution_time) : $max_execution_time;
+
+  $linkchecker_check_links_interval = variable_get('linkchecker_check_links_interval', 2419200);
+  $linkchecker_check_useragent = variable_get('linkchecker_check_useragent', 'Drupal (+http://drupal.org/)');
+
+  // Connection limit can be overridden via settings.php. Two connections is the
+  // limit defined in RFC http://www.ietf.org/rfc/rfc2616.txt. Modern browsers
+  // are typically using 6-8 connections and no more. Never use more and keep
+  // in mind that you can overload other people servers.
+  $linkchecker_check_domain_connections = variable_get('linkchecker_check_domain_connections', 2);
+
+  // Get URLs for checking.
+  $result = db_query_range("SELECT * FROM {linkchecker_links} WHERE last_checked < %d AND status = %d ORDER BY last_checked, lid ASC", time() - $linkchecker_check_links_interval, 1, 0, $check_links_max_per_cron_run);
+
+  // Fetch all rows first so they can be counted in a driver independent way.
+  // Reading ->num_rows from the result only works with the mysqli driver; with
+  // other drivers the counter was never initialised and the HTTPRL queue was
+  // sent after every single link instead of once at the end.
+  $links = array();
+  while ($row = db_fetch_object($result)) {
+    $links[] = $row;
+  }
+  $links_remaining = count($links);
+
+  foreach ($links as $link) {
+    $headers = array();
+    $headers['User-Agent'] = $linkchecker_check_useragent;
+
+    $uri = @parse_url($link->url);
+
+    // URL contains a fragment.
+    if (in_array($link->method, array('HEAD', 'GET')) && !empty($uri['fragment'])) {
+      // We need the full content and not only the HEAD.
+      $link->method = 'GET';
+      // Request text content only (like Firefox/Chrome).
+      $headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8';
+    }
+    elseif ($link->method == 'GET') {
+      // Range: Only request the first 1024 bytes from remote server. This is
+      // required to prevent timeouts on URLs that are large downloads.
+      $headers['Range'] = 'bytes=0-1024';
+    }
+
+    // Add in the headers.
+    $options = array(
+      'headers' => $headers,
+      'method' => $link->method,
+      'max_redirects' => 0,
+    );
+
+    if ($has_httprl) {
+      // Define the callback and add the $link object to it.
+      // Notes:
+      // - 'global_timeout' does not require a timer_read('page'), as this job
+      //   runs in a new process, independent of cron.
+      $options += array(
+        'global_connections' => $linkchecker_check_connections_max,
+        'global_timeout' => $max_execution_time - 30,
+        'domain_connections' => $linkchecker_check_domain_connections,
+        'callback' => array(
+          array(
+            'function' => '_linkchecker_status_handling',
+          ),
+          $link, // This need to be passed or it's not send back to _linkchecker_status_handling()
+        )
+      );
+      // Queue up the requests.
+      httprl_request($link->url, $options);
+      $links_remaining--;
+
+      // After all links are queued, run the url checks.
+      if ($links_remaining == 0) {
+        httprl_send_request();
+      }
+    }
+    else {
+      // Drupal core
+      $response = drupal_http_request($link->url, $options['headers'], $options['method'], NULL, $options['max_redirects']);
+
+      // Add 'redirect_code' property to core response object for consistency
+      // with HTTPRL object.
+      if ($response->code == 301 && !isset($response->redirect_code)) {
+        $response->redirect_code = $response->code;
+      }
+      // Add 'uri' property to core response object for 'fragment' check and
+      // consistency with HTTPRL object.
+      $response->uri = $uri;
+
+      _linkchecker_status_handling($response, $link);
+
+      if ((timer_read('page') / 1000) > ($max_execution_time / 2)) {
+        break; // Stop once we have used over half of the maximum execution time.
+      }
+    }
+  }
+
+  // Release the lock.
+  lock_release(__FUNCTION__);
+  watchdog('linkchecker', 'Link checks completed.', array(), WATCHDOG_INFO);
+  // Peak memory usage is only available in PHP >= 5.2.
+  if (version_compare(phpversion(), '5.2.0', '>=')) {
+    watchdog('linkchecker', 'Memory usage: @memory_get_usage, Peak memory usage: @memory_get_peak_usage.', array('@memory_get_peak_usage' => format_size(memory_get_peak_usage()), '@memory_get_usage' => format_size(memory_get_usage())), WATCHDOG_DEBUG);
+  }
+  else {
+    watchdog('linkchecker', 'Memory usage: @memory_get_usage.', array('@memory_get_usage' => format_size(memory_get_usage())), WATCHDOG_DEBUG);
+  }
+  return TRUE;
+}
+
+/**
+ * Status code handling.
+ *
+ * Stores the outcome of a link check in {linkchecker_links} and, depending on
+ * the status code and the configured actions, repairs permanently moved links
+ * (301) in nodes, comments and custom blocks or unpublishes nodes (404).
+ *
+ * @param object $response
+ *   An object containing the HTTP request headers, response code, headers,
+ *   data and redirect status. Passed by reference; it is replaced by an empty
+ *   object at the end to free memory.
+ * @param object $link
+ *   An object containing the url, lid and fail_count.
+ */
+function _linkchecker_status_handling(&$response, $link) {
+  $ignore_response_codes = preg_split('/(\r\n?|\n)/', variable_get('linkchecker_ignore_response_codes', "200\n206\n302\n304\n401\n403"));
+
+  // - Prevent E_ALL warnings in DB updates for non-existing $response->error.
+  // - @todo drupal_http_request() may not provide an UTF8 encoded error message
+  //   what results in a database UPDATE failure. For more information, see
+  //   http://drupal.org/node/371495.
+  //   Workaround: ISO-8859-1 as source encoding may be wrong, but WFM.
+  if (!isset($response->error)) {
+    $response->error = '';
+  }
+  if (!isset($response->status_message)) {
+    $response->status_message = '';
+  }
+  $response->error = trim(drupal_convert_to_utf8($response->error, 'ISO-8859-1'));
+  $response->status_message = trim(drupal_convert_to_utf8($response->status_message, 'ISO-8859-1'));
+
+  // Destination anchors in HTML documents may be specified either by the A
+  // element (naming it with the name attribute), or by any other element
+  // (naming with the id attribute).
+  // See http://www.w3.org/TR/html401/struct/links.html
+  if ($response->code == 200
+    && !empty($response->data)
+    && !empty($response->headers['content-type'])
+    && !empty($response->uri['fragment'])
+    && in_array($response->headers['content-type'], array('text/html', 'application/xhtml+xml', 'application/xml'))
+    && !preg_match('/(\s[^>]*(name|id)(\s+)?=(\s+)?["\'])(' . preg_quote($response->uri['fragment'], '/') . ')(["\'][^>]*>)/i', $response->data)
+  ) {
+    // Override status code 200 with status code 404 so it can be handled with
+    // default status code 404 logic and custom error text.
+    $response->code = 404;
+    $response->status_message = $response->error = 'URL fragment identifier not found in content';
+  }
+
+  switch ($response->code) {
+    case -4: // HTTPRL: httprl_send_request timed out.
+      // Skip these and try them again next cron run.
+      break;
+
+    case -2: // HTTPRL: maximum allowed redirects exhausted.
+    case 301:
+      db_query("UPDATE {linkchecker_links} SET code = %d, error = '%s', fail_count = fail_count+1, last_checked = %d WHERE lid = %d", $response->redirect_code, $response->status_message, time(), $link->lid);
+
+      // A HTTP status code of 301 tells us an existing link have changed to
+      // a new link. The remote site owner was so kind to provide us the new
+      // link and if we trust this change we are able to replace the old link
+      // with the new one without any hand work.
+      $auto_repair_301 = variable_get('linkchecker_action_status_code_301', 0);
+      if ($auto_repair_301 && $auto_repair_301 <= ($link->fail_count+1) && valid_url($response->redirect_url, TRUE)) {
+        // Switch anonymous user to an admin.
+        linkchecker_impersonate_user(user_load(array('name' => variable_get('linkchecker_impersonate_user', ''))));
+
+        // NODES: Autorepair all nodes having this outdated link.
+        $res = db_query("SELECT * FROM {linkchecker_nodes} WHERE lid = %d", $link->lid);
+        while ($row = db_fetch_object($res)) {
+          $node = node_load(array('nid' => $row->nid));
+
+          // Has the node object loaded successfully?
+          if (is_object($node)) {
+            $node_original = drupal_clone($node);
+
+            // Create array of node fields to scan (for e.g. $node->title, $node->links_weblink_url).
+            $text_items = array();
+            $text_items[] = 'title';
+            $text_items[] = 'body';
+            $text_items[] = 'teaser';
+
+            // Update 'weblink' nodes from 'links' module package.
+            if (module_exists('links_weblink') && $node->type == 'weblink' && isset($node->links_weblink_url)) {
+              $text_items[] = 'links_weblink_url';
+            }
+
+            // Update 'weblinks' nodes from 'weblinks' module.
+            if (module_exists('weblinks') && $node->type == 'weblinks' && isset($node->url)) {
+              $text_items[] = 'url';
+            }
+
+            // Now replace the outdated link with the permanently moved one in all node fields.
+            foreach ($text_items as $text_item) {
+              _linkchecker_link_replace($node->$text_item, $link->url, $response->redirect_url);
+            }
+
+            // Search for CCK-fields of types 'link' and 'text'.
+            if (module_exists('content')) {
+              $fields = content_fields(NULL, $node->type);
+              foreach ($fields as $field) {
+                if (isset($node->{$field['field_name']})) {
+                  // The braces around the property name are required: without
+                  // them PHP 7+ evaluates ($node->$field)['field_name'] and
+                  // the field values are never visited.
+                  if (module_exists('link') && $field['type'] == 'link') {
+                    foreach ($node->{$field['field_name']} as $delta => $item) {
+                      _linkchecker_link_replace($node->{$field['field_name']}[$delta]['url'], $link->url, $response->redirect_url);
+                    }
+                  }
+                  elseif (module_exists('text') && $field['type'] == 'text') {
+                    foreach ($node->{$field['field_name']} as $delta => $item) {
+                      _linkchecker_link_replace($node->{$field['field_name']}[$delta]['value'], $link->url, $response->redirect_url);
+                    }
+                  }
+                }
+              }
+            }
+
+            if ($node_original != $node) {
+              // Always use the default revision setting. See node_object_prepare().
+              $node_options = variable_get('node_options_'. $node->type, array('status', 'promote'));
+              $node->revision = in_array('revision', $node_options);
+
+              // Generate a log message for the node_revisions table, visible on the node's revisions tab.
+              $node->log = t('Changed permanently moved link in %node from %src to %dst.', array('%node' => url('node/' . $row->nid), '%src' => $link->url, '%dst' => $response->redirect_url));
+
+              // Save changed node and update the node link list.
+              node_save($node);
+              watchdog('linkchecker', 'Changed permanently moved link in %node from %src to %dst.', array('%node' => url('node/' . $row->nid), '%src' => $link->url, '%dst' => $response->redirect_url), WATCHDOG_INFO);
+            }
+            else {
+              watchdog('linkchecker', 'Link update in node failed. Permanently moved link %src not found in node %node. Manual fix required.', array('%node' => url('node/' . $row->nid), '%src' => $link->url), WATCHDOG_WARNING);
+            }
+          }
+          else {
+            watchdog('linkchecker', 'Loading node %node for update failed. Manual fix required.', array('%node' => $row->nid), WATCHDOG_ERROR);
+          }
+        }
+
+        // COMMENTS: Autorepair all comments having this outdated link.
+        $res = db_query("SELECT * FROM {linkchecker_comments} WHERE lid = %d", $link->lid);
+        while ($row = db_fetch_object($res)) {
+          $comment = _linkchecker_comment_load($row->cid);
+
+          // Has the custom comment array loaded successfully?
+          if (!empty($comment)) {
+            $comment_original = $comment;
+
+            // Create array of comment fields to scan (for e.g. $comment->subject, $comment->comment).
+            $text_items = array();
+            $text_items[] = 'subject';
+            $text_items[] = 'comment';
+
+            // Now replace the outdated link with the permanently moved one in all comment fields.
+            foreach ($text_items as $text_item) {
+              _linkchecker_link_replace($comment[$text_item], $link->url, $response->redirect_url);
+            }
+
+            // Save changed comment and update the comment link list.
+            $comment_diff = array_diff($comment, $comment_original);
+            if (!empty($comment_diff)) {
+              comment_save($comment);
+              watchdog('linkchecker', 'Changed permanently moved link in comment %comment from %src to %dst.', array('%comment' => $comment['cid'], '%src' => $link->url, '%dst' => $response->redirect_url), WATCHDOG_INFO);
+            }
+            else {
+              watchdog('linkchecker', 'Link update in comment failed. Permanently moved link %src not found in comment %comment. Manual fix required.', array('%comment' => $comment['cid'], '%src' => $link->url), WATCHDOG_WARNING);
+            }
+          }
+          else {
+            // The comment could not be loaded, so its ID has to come from the
+            // {linkchecker_comments} row and not from the empty $comment.
+            watchdog('linkchecker', 'Loading comment %comment for update failed. Manual fix required.', array('%comment' => $row->cid), WATCHDOG_ERROR);
+          }
+        }
+
+        // BOXES: Autorepair all boxes having this outdated link.
+        $res = db_query("SELECT * FROM {linkchecker_boxes} WHERE lid = %d", $link->lid);
+        while ($row = db_fetch_object($res)) {
+          $box = block_box_get($row->bid);
+
+          // Has the custom block array loaded successfully?
+          if (!empty($box)) {
+            $box_original = $box;
+
+            // Create array of box fields to scan.
+            $text_items = array();
+            $text_items[] = 'info';
+            $text_items[] = 'body';
+
+            // Now replace the outdated link with the permanently moved one in all
+            // box fields.
+            foreach ($text_items as $text_item) {
+              _linkchecker_link_replace($box[$text_item], $link->url, $response->redirect_url);
+            }
+
+            $box_diff = array_diff($box, $box_original);
+            if (!empty($box_diff)) {
+              // Save changed box and update the box link list.
+              block_box_save($box, $row->bid);
+              // There is no hook that fires on block_box_save(), therefore do link
+              // extraction programmatically.
+              _linkchecker_add_box_links($box, $row->bid);
+              watchdog('linkchecker', 'Changed permanently moved link in box %bid from %src to %dst.', array('%bid' => $row->bid, '%src' => $link->url, '%dst' => $response->redirect_url), WATCHDOG_INFO);
+            }
+            else {
+              watchdog('linkchecker', 'Link update in block failed. Permanently moved link %src not found in block %bid. Manual fix required.', array('%bid' => $row->bid, '%src' => $link->url), WATCHDOG_WARNING);
+            }
+          }
+          else {
+            watchdog('linkchecker', 'Loading block %bid for update failed. Manual fix required.', array('%bid' => $row->bid), WATCHDOG_ERROR);
+          }
+        }
+
+        // Revert user back to anonymous.
+        linkchecker_revert_user();
+      }
+      else {
+        watchdog('linkchecker', 'Link %link has changed and needs to be updated.', array('%link' => $link->url), WATCHDOG_NOTICE, l(t('Broken links'), 'admin/reports/linkchecker'));
+      }
+      break;
+
+    case 404:
+      db_query("UPDATE {linkchecker_links} SET code = %d, error = '%s', fail_count = fail_count+1, last_checked = %d WHERE lid = %d", $response->code, $response->error, time(), $link->lid);
+      watchdog('linkchecker', 'Broken link %link has been found.', array('%link' => $link->url), WATCHDOG_NOTICE, l(t('Broken links'), 'admin/reports/linkchecker'));
+
+      // If unpublishing limit is reached, unpublish all nodes having this link.
+      $linkchecker_action_status_code_404 = variable_get('linkchecker_action_status_code_404', 0);
+      if ($linkchecker_action_status_code_404 && $linkchecker_action_status_code_404 <= ($link->fail_count+1)) {
+        // Switch anonymous user to an admin.
+        linkchecker_impersonate_user(user_load(array('name' => variable_get('linkchecker_impersonate_user', ''))));
+        _linkchecker_unpublish_nodes($link->lid);
+        linkchecker_revert_user();
+      }
+      break;
+
+    case 405:
+      // - 405: Special error handling if method is not allowed. Switch link
+      //   checking to GET method and try again.
+      db_query("UPDATE {linkchecker_links} SET method = '%s', code = %d, error = '%s', fail_count = fail_count+1, last_checked = %d WHERE lid = %d", 'GET', $response->code, $response->error, time(), $link->lid);
+      watchdog('linkchecker', 'Method HEAD is not allowed for link %link. Method has been changed to GET.', array('%link' => $link->url), WATCHDOG_INFO, l(t('Broken links'), 'admin/reports/linkchecker'));
+      break;
+
+    case 500:
+      // - 500: Like WGET, try with GET on "500 Internal server error".
+      // - If GET also fails with status code 500, than the link is broken.
+      if ($link->method == 'GET' && $response->code == 500) {
+        db_query("UPDATE {linkchecker_links} SET code = %d, error = '%s', fail_count = fail_count+1, last_checked = %d WHERE lid = %d", $response->code, $response->error, time(), $link->lid);
+        watchdog('linkchecker', 'Broken link %link has been found.', array('%link' => $link->url), WATCHDOG_NOTICE, l(t('Broken links'), 'admin/reports/linkchecker'));
+      }
+      else {
+        db_query("UPDATE {linkchecker_links} SET method = '%s', code = %d, error = '%s', fail_count = fail_count+1, last_checked = %d WHERE lid = %d", 'GET', $response->code, $response->error, time(), $link->lid);
+        watchdog('linkchecker', 'Internal server error for link %link. Method has been changed to GET.', array('%link' => $link->url), WATCHDOG_INFO, l(t('Broken links'), 'admin/reports/linkchecker'));
+      }
+      break;
+
+    default:
+      // Don't treat ignored response codes as errors.
+      if (in_array($response->code, $ignore_response_codes)) {
+        db_query("UPDATE {linkchecker_links} SET code = %d, error = '%s', fail_count = %d, last_checked = %d WHERE lid = %d", $response->code, $response->error, 0, time(), $link->lid);
+        //watchdog('linkchecker', 'Unhandled link error %link has been found.', array('%link' => $link->url), WATCHDOG_ERROR, l(t('Broken links'), 'admin/reports/linkchecker'));
+      }
+      else {
+        db_query("UPDATE {linkchecker_links} SET code = %d, error = '%s', fail_count = fail_count+1, last_checked = %d WHERE lid = %d", $response->code, $response->error, time(), $link->lid);
+        //watchdog('linkchecker', 'Unhandled link error %link has been found.', array('%link' => $link->url), WATCHDOG_ERROR, l(t('Broken links'), 'admin/reports/linkchecker'));
+      }
+  }
+
+  // Free Memory.
+  $response = new stdClass();
+}
+
+/**
+ * Implementation of hook_nodeapi().
+ *
+ * Keeps the stored links of a node in sync when it is inserted, updated or
+ * deleted, and warns about failed link checks on the node edit page.
+ */
+function linkchecker_nodeapi(&$node, $op, $a3 = NULL, $a4 = NULL) {
+  switch ($op) {
+    case 'insert':
+      // The node is going to be published.
+      if (_linkchecker_scan_nodetype($node->type) && $node->status) {
+        _linkchecker_add_node_links($node);
+      }
+      break;
+
+    case 'update':
+      // The node is going to be published.
+      if (_linkchecker_scan_nodetype($node->type) && $node->status) {
+        _linkchecker_add_node_links($node);
+      }
+      else {
+        // The node is going to be unpublished.
+        _linkchecker_delete_node_links($node->nid);
+      }
+      break;
+
+    case 'delete':
+      _linkchecker_delete_node_links($node->nid);
+      break;
+
+    case 'prepare':
+      // Node edit tab is viewed.
+      if (arg(0) == 'node' && is_numeric(arg(1)) && arg(2) == 'edit' && isset($node->nid)) {
+        // Show a message on node edit page if a link check failed once or more.
+        $ignore_response_codes = preg_split('/(\r\n?|\n)/', variable_get('linkchecker_ignore_response_codes', "200\n206\n302\n304\n401\n403"));
+        $links = db_query("SELECT ll.* FROM {linkchecker_nodes} ln INNER JOIN {linkchecker_links} ll ON ln.lid = ll.lid WHERE ln.nid = %d AND ll.fail_count > %d AND ll.status = %d AND ll.code NOT IN (" . db_placeholders($ignore_response_codes, 'int') . ")", array_merge(array($node->nid, 0, 1), $ignore_response_codes));
+        while ($link = db_fetch_object($links)) {
+          if (_linkchecker_link_access($link)) {
+            drupal_set_message(format_plural($link->fail_count, 'Link check of @url failed once (status code: @code).', 'Link check of @url failed @count times (status code: @code).', array('@url' => $link->url, '@code' => $link->code)), 'warning', FALSE);
+          }
+        }
+      }
+      break;
+  }
+}
+
+/**
+ * Implementation of hook_comment().
+ *
+ * Extracts links when a comment is published and removes them when the
+ * comment is unpublished or deleted.
+ */
+function linkchecker_comment($comment, $op) {
+  // Convert $comment object (admin/content/comment) to array (comment/edit/[cid]).
+  $comment = (array) $comment;
+
+  switch ($op) {
+    case 'publish':
+      $node_type = db_result(db_query("SELECT type FROM {node} WHERE nid = %d", $comment['nid']));
+      if (_linkchecker_scan_nodetype($node_type) && variable_get('linkchecker_scan_comments', 0)) {
+        _linkchecker_add_comment_links($comment);
+      }
+      break;
+
+    case 'unpublish':
+    case 'delete':
+      _linkchecker_delete_comment_links($comment['cid']);
+      break;
+  }
+}
+
+/**
+ * Implementation of hook_form_alter().
+ *
+ * Adds submit handlers to the block add, configure and delete forms, and
+ * shows failed link checks on the block and comment edit forms.
+ */
+function linkchecker_form_alter(&$form, $form_state, $form_id) {
+  switch ($form_id) {
+    // Catch the block add/configure form and add custom submit handler.
+    case 'block_add_block_form':
+      // Add custom submit handler to block add form.
+      $form['#submit'][] = 'linkchecker_block_add_form_submit';
+      break;
+
+    case 'block_admin_configure':
+      // When displaying the form, show the broken links warning.
+      if (empty($form_state['post']) && is_numeric(arg(5))) {
+        // Show a message on block edit page if a link check failed once or more.
+        $ignore_response_codes = preg_split('/(\r\n?|\n)/', variable_get('linkchecker_ignore_response_codes', "200\n206\n302\n304\n401\n403"));
+        $links = db_query("SELECT ll.* FROM {linkchecker_boxes} lb INNER JOIN {linkchecker_links} ll ON lb.lid = ll.lid WHERE lb.bid = %d AND ll.fail_count > %d AND ll.status = %d AND ll.code NOT IN (" . db_placeholders($ignore_response_codes, 'int') . ")", array_merge(array(arg(5), 0, 1), $ignore_response_codes));
+        while ($link = db_fetch_object($links)) {
+          if (_linkchecker_link_access($link)) {
+            drupal_set_message(format_plural($link->fail_count, 'Link check of @url failed once (status code: @code).', 'Link check of @url failed @count times (status code: @code).', array('@url' => $link->url, '@code' => $link->code)), 'warning', FALSE);
+          }
+        }
+      }
+
+      // Add custom submit handler to block configuration form.
+      $form['#submit'][] = 'linkchecker_block_configure_form_submit';
+      break;
+
+    case 'block_box_delete':
+      $form['#submit'][] = 'linkchecker_block_box_delete_form_submit';
+      break;
+
+    case 'comment_form':
+      // When displaying the form as 'view' or 'preview', show the broken links warning.
+      if ((empty($form_state['post']) || isset($form_state['post']['op']) && $form_state['post']['op'] == t('Preview')) && arg(0) == 'comment' && arg(1) == 'edit' && is_numeric(arg(2))) {
+        // Show a message on comment edit page if a link check failed once or more.
+        $ignore_response_codes = preg_split('/(\r\n?|\n)/', variable_get('linkchecker_ignore_response_codes', "200\n206\n302\n304\n401\n403"));
+        $links = db_query("SELECT ll.* FROM {linkchecker_comments} lc INNER JOIN {linkchecker_links} ll ON lc.lid = ll.lid WHERE lc.cid = %d AND ll.fail_count > %d AND ll.status = %d AND ll.code NOT IN (" . db_placeholders($ignore_response_codes, 'int') . ")", array_merge(array(arg(2), 0, 1), $ignore_response_codes));
+        while ($link = db_fetch_object($links)) {
+          if (_linkchecker_link_access($link)) {
+            drupal_set_message(format_plural($link->fail_count, 'Link check of @url failed once (status code: @code).', 'Link check of @url failed @count times (status code: @code).', array('@url' => $link->url, '@code' => $link->code)), 'warning', FALSE);
+          }
+        }
+      }
+      break;
+  }
+}
+
+/**
+ * Custom submit handler for block add page.
+ */
+function linkchecker_block_add_form_submit($form, &$form_state) {
+  if (variable_get('linkchecker_scan_blocks', 0)) {
+    // The highest bid in {boxes} is taken to be the box that was just added.
+    $bid = db_result(db_query("SELECT MAX(bid) FROM {boxes}"));
+    _linkchecker_add_box_links($form_state['values'], $bid);
+  }
+}
+
+/**
+ * Custom submit handler for block configure page.
+ */
+function linkchecker_block_configure_form_submit($form, &$form_state) {
+  if (variable_get('linkchecker_scan_blocks', 0)) {
+    _linkchecker_add_box_links($form_state['values'], $form_state['values']['delta']);
+  }
+}
+
+/**
+ * Custom submit handler for block delete page.
+ */
+function linkchecker_block_box_delete_form_submit($form, &$form_state) {
+  _linkchecker_delete_box_links($form_state['values']['bid']);
+}
+
+/**
+ * Extracts links from a node.
+ *
+ * @param $node
+ *   The fully populated node object.
+ * @param $return_field_names
+ *   If set to TRUE, the returned array will contain the link URLs as keys, and
+ *   each element will be an array containing all field names in which the URL
+ *   is found (the special field name "node" is used to represent all scanned
+ *   node content that is not a CCK field). Otherwise, a simple array of URLs
+ *   will be returned.
+ * @return
+ *   An array whose keys are fully qualified and unique URLs found in the node
+ *   (as returned by _linkchecker_extract_links()), or a more complex
+ *   structured array (see above) if $return_field_names is TRUE.
+ */ +function _linkchecker_extract_node_links($node, $return_field_names = FALSE) { + // Get current node language options for url() functions. + $languages = language_list(); + $url_options = empty($node->language) ? array('absolute' => TRUE) : array('language' => $languages[$node->language], 'absolute' => TRUE); + + // Create array of node fields to scan. + $text_items = array(); + $text_items_by_field = array(); + $text_items[] = $text_items_by_field['node'][] = _filter_url($node->title, $node->format); + $text_items[] = $text_items_by_field['node'][] = _linkchecker_check_markup($node->body, $node->format, FALSE); + $text_items[] = $text_items_by_field['node'][] = _linkchecker_check_markup($node->teaser, $node->format, FALSE); + + // Search for links in 'weblink' nodes from 'links' module package. + if (module_exists('links_weblink') && $node->type == 'weblink' && !empty($node->links_weblink_url)) { + $text_items[] = $text_items_by_field['node'][] = _filter_url(url($node->links_weblink_url, $url_options), $node->format); + } + + // Search for links in 'weblinks' nodes from 'weblinks' module. + if (module_exists('weblinks') && $node->type == 'weblinks' && !empty($node->url)) { + $text_items[] = $text_items_by_field['node'][] = _filter_url(url($node->url, $url_options), $node->format); + } + + // Search for CCK-fields of types 'link' and 'text'. + if (module_exists('content')) { + $fields = content_fields(NULL, $node->type); + foreach ($fields as $field) { + if (!empty($node->{$field['field_name']})) { + if (module_exists('link') && $field['type'] == 'link') { + foreach ($node->$field['field_name'] as $delta => $item) { + if (!empty($item['url'])) { + // Make non-absolute urls absolute or they are not found by _filter_url(). 
+ $text_items[] = $text_items_by_field[$field['field_name']][] = _filter_url(url($item['url'], $url_options), $node->format); + } + } + } + elseif (module_exists('text') && $field['type'] == 'text') { + foreach ($node->$field['field_name'] as $delta => $item) { + $text_items[] = $text_items_by_field[$field['field_name']][] = _filter_url($item['value'], $node->format); + } + } + } + } + } + + // Get the absolute node path for extraction of relative links. + $path = url('node/'. $node->nid, $url_options); + + // Extract all links in a node. + $links = _linkchecker_extract_links(implode(' ', $text_items), $path); + + // Return either the array of links, or an array of field names containing + // each link, depending on what was requested. + if (!$return_field_names) { + return $links; + } + else { + $field_names = array(); + foreach ($text_items_by_field as $field_name => $items) { + foreach ($items as $item) { + foreach ($links as $uri => $link) { + // We only need to do a quick check here to see if the URL appears + // anywhere in the text; if so, that means users with access to this + // field will be able to see the URL (and any private data such as + // passwords contained in it). This is sufficient for the purposes of + // _linkchecker_link_node_ids(), where this information is used. + foreach ($link as $original_link) { + if (strpos($item, $original_link) !== FALSE) { + $field_names[$uri][$field_name] = $field_name; + } + } + } + } + } + return $field_names; + } +} + +/** + * Add node links to database. + * + * @param $node + * The fully populated node object. + * @param $skip_missing_links_detection + * To prevent endless batch loops the value need to be TRUE. With FALSE + * the need for content re-scans is detected by the number of missing links. + */ +function _linkchecker_add_node_links($node, $skip_missing_links_detection = FALSE) { + $links = array_keys(_linkchecker_extract_node_links($node)); + + // Node have links. 
+ if (!empty($links)) { + // Remove all links from the links array already in the database + // and only add missing links to database. + $missing_links = _linkchecker_node_links_missing($node->nid, $links); + + // Only add links to database that do not exists. + $i = 0; + foreach ($missing_links as $url) { + $urlhash = md5($url); + $link = db_fetch_object(db_query("SELECT lid FROM {linkchecker_links} WHERE urlhash = '%s'", $urlhash)); + if (!$link) { + $link = new stdClass(); + $link->urlhash = $urlhash; + $link->url = $url; + $link->status = _linkchecker_link_check_status_filter($url); + drupal_write_record('linkchecker_links', $link); + } + db_query("INSERT INTO {linkchecker_nodes} (nid, lid) VALUES (%d, %d)", $node->nid, $link->lid); + + // Break processing if max links limit per run has been reached. + $i++; + if ($i >= LINKCHECKER_SCAN_MAX_LINKS_PER_RUN) { + break; + } + } + + // The first chunk of links not yet found in the {linkchecker_links} table + // have now been imported by the above code. If the number of missing links + // still exceeds the scan limit defined in LINKCHECKER_SCAN_MAX_LINKS_PER_RUN + // the content need to be re-scanned until all links have been collected and + // saved in {linkchecker_links} table. + // + // Above code has already scanned a number of LINKCHECKER_SCAN_MAX_LINKS_PER_RUN + // links and need to be substracted from the number of missing links to + // calculate the correct number of re-scan rounds. + // + // To prevent endless loops the $skip_missing_links_detection need to be TRUE. + // This value will be set by the calling batch process that already knows + // that it is running a batch job and the number of required re-scan rounds. 
+ $missing_links_count = count($missing_links) - LINKCHECKER_SCAN_MAX_LINKS_PER_RUN; + if (!$skip_missing_links_detection && $missing_links_count > 0) { + module_load_include('inc', 'linkchecker', '/includes/linkchecker.batch'); + batch_set(_linkchecker_batch_import_single_node($node->nid, $missing_links_count)); + + // If batches were set in the submit handlers, we process them now, + // possibly ending execution. We make sure we do not react to the batch + // that is already being processed (if a batch operation performs a + // drupal_execute). + if ($batch = &batch_get() && !isset($batch['current_set'])) { + batch_process('node/' . $node->nid); + } + } + } + + // Remove dead link references for cleanup reasons as very last step. + _linkchecker_cleanup_node_references($node->nid, $links); +} + +/** + * Add comment links to database. + * + * @param $comment + * The fully populated comment array. + * @param $skip_missing_links_detection + * To prevent endless batch loops the value need to be TRUE. With FALSE + * the need for content re-scans is detected by the number of missing links. + */ +function _linkchecker_add_comment_links($comment, $skip_missing_links_detection = FALSE) { + // Create array of comment fields to scan. + $text_items = array(); + $text_items[] = _filter_url($comment['subject'], $comment['format']); + $text_items[] = _linkchecker_check_markup($comment['comment'], $comment['format'], FALSE); + + // Get the absolute node path for extraction of relative links. + $languages = language_list(); + $node_language = db_result(db_query("SELECT language FROM {node} WHERE nid = %d", $comment['nid'])); + $path = url('node/'. $comment['nid'], array('language' => $languages[$node_language], 'absolute' => TRUE)); + + // Extract all links in a comment. + $links = array_keys(_linkchecker_extract_links(implode(' ', $text_items), $path)); + + // Comment have links. 
+ if (!empty($links)) { + // Remove all links from the links array already in the database + // and only add missing links to database. + $missing_links = _linkchecker_comment_links_missing($comment['cid'], $links); + + // Only add unique links to database that do not exist. + $i = 0; + foreach ($missing_links as $url) { + $urlhash = md5($url); + $link = db_fetch_object(db_query("SELECT lid FROM {linkchecker_links} WHERE urlhash = '%s'", $urlhash)); + if (!$link) { + $link = new stdClass(); + $link->urlhash = $urlhash; + $link->url = $url; + $link->status = _linkchecker_link_check_status_filter($url); + drupal_write_record('linkchecker_links', $link); + } + db_query("INSERT INTO {linkchecker_comments} (cid, lid) VALUES (%d, %d)", $comment['cid'], $link->lid); + + // Break processing if max links limit per run has been reached. + $i++; + if ($i >= LINKCHECKER_SCAN_MAX_LINKS_PER_RUN) { + break; + } + } + + // The first chunk of links not yet found in the {linkchecker_links} table + // have now been imported by the above code. If the number of missing links + // still exceeds the scan limit defined in LINKCHECKER_SCAN_MAX_LINKS_PER_RUN + // the content need to be re-scanned until all links have been collected and + // saved in {linkchecker_links} table. + // + // Above code has already scanned a number of LINKCHECKER_SCAN_MAX_LINKS_PER_RUN + // links and need to be substracted from the number of missing links to + // calculate the correct number of re-scan rounds. + // + // To prevent endless loops the $skip_missing_links_detection need to be TRUE. + // This value will be set by the calling batch process that already knows + // that it is running a batch job and the number of required re-scan rounds. 
+ $missing_links_count = count($missing_links) - LINKCHECKER_SCAN_MAX_LINKS_PER_RUN; + if (!$skip_missing_links_detection && $missing_links_count > 0) { + module_load_include('inc', 'linkchecker', '/includes/linkchecker.batch'); + batch_set(_linkchecker_batch_import_single_comment($comment['cid'], $missing_links_count)); + + // If batches were set in the submit handlers, we process them now, + // possibly ending execution. We make sure we do not react to the batch + // that is already being processed (if a batch operation performs a + // drupal_execute). + if ($batch = &batch_get() && !isset($batch['current_set'])) { + batch_process('node/' . $comment['nid']); + } + } + } + + // Remove dead link references for cleanup reasons as very last step. + _linkchecker_cleanup_comment_references($comment['cid'], $links); +} + +/** + * Add block links to database. + * + * @param array $box + * The fully populated block array. + * @param integer $bid + * Block id from table {blocks}.bid. + * @param $skip_missing_links_detection + * To prevent endless batch loops the value need to be TRUE. With FALSE + * the need for content re-scans is detected by the number of missing links. + */ +function _linkchecker_add_box_links($box, $bid, $skip_missing_links_detection = FALSE) { + // Create array of box fields to scan. + $text_items = array(); + $text_items[] = _filter_url($box['info'], $box['format']); + $text_items[] = _linkchecker_check_markup($box['body'], $box['format'], FALSE); + + // Extract all links in a box. + $links = array_keys(_linkchecker_extract_links(implode(' ', $text_items))); + + // Box has links. + if (!empty($links)) { + // Remove all links from the links array already in the database + // and only add missing links to database. + $missing_links = _linkchecker_box_links_missing($bid, $links); + + // Only add unique links to database that do not exist. 
+ $i = 0; + foreach ($missing_links as $url) { + $urlhash = md5($url); + $link = db_fetch_object(db_query("SELECT lid FROM {linkchecker_links} WHERE urlhash = '%s'", $urlhash)); + if (!$link) { + $link = new stdClass(); + $link->urlhash = $urlhash; + $link->url = $url; + $link->status = _linkchecker_link_check_status_filter($url); + drupal_write_record('linkchecker_links', $link); + } + db_query("INSERT INTO {linkchecker_boxes} (bid, lid) VALUES (%d, %d)", $bid, $link->lid); + + // Break processing if max links limit per run has been reached. + $i++; + if ($i >= LINKCHECKER_SCAN_MAX_LINKS_PER_RUN) { + break; + } + } + + // The first chunk of links not yet found in the {linkchecker_links} table + // have now been imported by the above code. If the number of missing links + // still exceeds the scan limit defined in LINKCHECKER_SCAN_MAX_LINKS_PER_RUN + // the content need to be re-scanned until all links have been collected and + // saved in {linkchecker_links} table. + // + // Above code has already scanned a number of LINKCHECKER_SCAN_MAX_LINKS_PER_RUN + // links and need to be substracted from the number of missing links to + // calculate the correct number of re-scan rounds. + // + // To prevent endless loops the $skip_missing_links_detection need to be TRUE. + // This value will be set by the calling batch process that already knows + // that it is running a batch job and the number of required re-scan rounds. + $missing_links_count = count($missing_links) - LINKCHECKER_SCAN_MAX_LINKS_PER_RUN; + if (!$skip_missing_links_detection && $missing_links_count > 0) { + module_load_include('inc', 'linkchecker', '/includes/linkchecker.batch'); + batch_set(_linkchecker_batch_import_single_box($bid, $missing_links_count)); + + // If batches were set in the submit handlers, we process them now, + // possibly ending execution. We make sure we do not react to the batch + // that is already being processed (if a batch operation performs a + // drupal_execute). 
+ if ($batch = &batch_get() && !isset($batch['current_set'])) { + batch_process('admin/build/block'); + } + } + } + + // Remove dead link references for cleanup reasons as very last step. + _linkchecker_cleanup_box_references($bid, $links); +} + +/** + * Remove all node references to links in the linkchecker_nodes table. + */ +function _linkchecker_delete_node_links($nid) { + return db_query("DELETE FROM {linkchecker_nodes} WHERE nid = %d", $nid); +} + +/** + * Remove all comment references to links in the linkchecker_comments table. + */ +function _linkchecker_delete_comment_links($cid) { + return db_query("DELETE FROM {linkchecker_comments} WHERE cid = %d", $cid); +} + +/** + * Remove all box references to links in the linkchecker_boxes table. + */ +function _linkchecker_delete_box_links($bid) { + return db_query("DELETE FROM {linkchecker_boxes} WHERE bid = %d", $bid); +} + +/** + * Cleanup no longer used node references to links in the linkchecker_nodes table. + */ +function _linkchecker_cleanup_node_references($nid = 0, $links = array()) { + if (empty($links)) { + // Node do not have links. Delete all references if exists. + db_query("DELETE FROM {linkchecker_nodes} WHERE nid = %d", $nid); + } + else { + // The node still have more than one link, but other links may have been + // removed and links no longer in the content need to be deleted from the + // linkchecker_nodes reference table. + db_query("DELETE FROM {linkchecker_nodes} WHERE nid = %d AND lid NOT IN (SELECT lid FROM {linkchecker_links} WHERE urlhash IN (" . db_placeholders($links, 'varchar') . "))", array_merge(array($nid), array_map('md5', $links))); + } +} + +/** + * Cleanup no longer used comment references to links in the linkchecker_comments table. + */ +function _linkchecker_cleanup_comment_references($cid = 0, $links = array()) { + if (empty($links)) { + // Comment do not have links. Delete all references if exists. 
+ db_query("DELETE FROM {linkchecker_comments} WHERE cid = %d", $cid); + } + else { + // The comment still have more than one link, but other links may have been + // removed and links no longer in the content need to be deleted from the + // linkchecker_comments reference table. + db_query("DELETE FROM {linkchecker_comments} WHERE cid = %d AND lid NOT IN (SELECT lid FROM {linkchecker_links} WHERE urlhash IN (" . db_placeholders($links, 'varchar') . "))", array_merge(array($cid), array_map('md5', $links))); + } +} + +/** + * Cleanup no longer used box references to links in the linkchecker_boxes table. + */ +function _linkchecker_cleanup_box_references($bid = 0, $links = array()) { + if (empty($links)) { + // Block do not have links. Delete all references if exists. + db_query("DELETE FROM {linkchecker_boxes} WHERE bid = %d", $bid); + } + else { + // The block still have more than one link, but other links may have been + // removed and links no longer in the content need to be deleted from the + // linkchecker_boxes reference table. + db_query("DELETE FROM {linkchecker_boxes} WHERE bid = %d AND lid NOT IN (SELECT lid FROM {linkchecker_links} WHERE urlhash IN (" . db_placeholders($links, 'varchar') . "))", array_merge(array($bid), array_map('md5', $links))); + } +} + +/** + * Returns an array of node references missing in the linkchecker_nodes table. + */ +function _linkchecker_node_links_missing($nid, $links) { + $res = db_query("SELECT url FROM {linkchecker_links} ll INNER JOIN {linkchecker_nodes} ln ON ll.lid = ln.lid WHERE ln.nid = %d AND urlhash IN (" . db_placeholders($links, 'varchar') . ")", array_merge(array($nid), array_map('md5', $links))); + $links_in_database = array(); + while ($row = db_fetch_object($res)) { + $links_in_database[] = $row->url; + } + return array_diff($links, $links_in_database); +} + +/** + * Returns an array of comment references missing in the linkchecker_comments table. 
+ */ +function _linkchecker_comment_links_missing($cid, $links) { + $res = db_query("SELECT url FROM {linkchecker_links} ll INNER JOIN {linkchecker_comments} lc ON ll.lid = lc.lid WHERE lc.cid = %d AND urlhash IN (" . db_placeholders($links, 'varchar') . ")", array_merge(array($cid), array_map('md5', $links))); + $links_in_database = array(); + while ($row = db_fetch_object($res)) { + $links_in_database[] = $row->url; + } + return array_diff($links, $links_in_database); +} + +/** + * Returns an array of box references missing in the linkchecker_boxes table. + */ +function _linkchecker_box_links_missing($bid, $links) { + $res = db_query("SELECT url FROM {linkchecker_links} ll INNER JOIN {linkchecker_boxes} lb ON ll.lid = lb.lid WHERE lb.bid = %d AND urlhash IN (" . db_placeholders($links, 'varchar') . ")", array_merge(array($bid), array_map('md5', $links))); + $links_in_database = array(); + while ($row = db_fetch_object($res)) { + $links_in_database[] = $row->url; + } + return array_diff($links, $links_in_database); +} + +/** + * Run perodically via cron and delete all links without a references. + * + * For speed reasons and check results we keep the links for some time + * as they may be reused by other new content. + */ +function _linkchecker_cleanup_links() { + // Remove disabled node types no longer in use. + $node_types = array_keys(array_filter(variable_get('linkchecker_scan_nodetypes', array()))); + if (!empty($node_types)) { + db_query('DELETE FROM {linkchecker_nodes} WHERE nid IN (SELECT nid FROM {node} n WHERE n.type NOT IN (' . db_placeholders($node_types, 'varchar') . '))', $node_types); + // FIXME: Remove comments + //db_query('DELETE FROM {linkchecker_comments} WHERE cid IN (SELECT nid FROM {node} n WHERE n.type NOT IN (' . db_placeholders($node_types, 'varchar') . '))', $node_types); + } + else { + db_query('TRUNCATE TABLE {linkchecker_nodes}'); + // FIXME: Remove comments + } + + // Remove comment link references if comment scanning is disabled. 
+ // TODO: Remove comments of unpublished nodes. + if (variable_get('linkchecker_scan_comments', 0) == 0) { + db_query('TRUNCATE TABLE {linkchecker_comments}'); + } + + // Remove block link references if block scanning is disabled. + if (variable_get('linkchecker_scan_blocks', 0) == 0) { + db_query('TRUNCATE TABLE {linkchecker_boxes}'); + } + + // Remove dead links without references. + db_query('DELETE FROM {linkchecker_links} + WHERE lid NOT IN ( + SELECT DISTINCT lid FROM {linkchecker_boxes} + UNION + SELECT DISTINCT lid FROM {linkchecker_comments} + UNION + SELECT DISTINCT lid FROM {linkchecker_nodes} + )'); + +} + +/** + * Extract links from content. + * + * @param string $text + * The text to be scanned for links. + * @param string $content_path + * Path to the content that is currently scanned for links. This value is + * required to build full qualified links from relative links. Relative links + * are not extracted from content, if path is not provided. + * @return array + * Array whose keys are fully qualified and unique URLs found in the + * content, and whose values are arrays of actual text (raw URLs or paths) + * corresponding to each fully qualified URL. + */ +function _linkchecker_extract_links($text = '', $content_path = NULL) { + global $base_root; + + // Finds all hyperlinks in the content. + $matches_a = array(1 => NULL); + if (variable_get('linkchecker_extract_from_a', 1) == 1) { + // Extract all chars in the href value, except double and single quotes. + $pattern_a = '/<(?:a|area)\s[^>]*href=["\']([^"\']*)["\'][^>]*>/i'; + preg_match_all($pattern_a, $text, $matches_a); + } + + // Finds all audio links in the content. + $matches_audio = array(1 => NULL); + if (variable_get('linkchecker_extract_from_audio', 1) == 1) { + $pattern_audio = '/]*src=["\']([^"\']*)["\'][^>]*>/i'; + preg_match_all($pattern_audio, $text, $matches_audio); + } + + // Finds embed tags with links in the content. 
+ $matches_embed = array(); + if (variable_get('linkchecker_extract_from_embed', 0) == 1) { + $pattern_embed_src = '/]*src=["\']([^"\']*)["\'][^>]*>/i'; + $pattern_embed_pluginurl = '/]*pluginurl=["\']([^"\']*)["\'][^>]*>/i'; + $pattern_embed_pluginspage = '/]*pluginspage=["\']([^"\']*)["\'][^>]*>/i'; + + preg_match_all($pattern_embed_src, $text, $matches_embed_src); + preg_match_all($pattern_embed_pluginurl, $text, $matches_embed_pluginurl); + preg_match_all($pattern_embed_pluginspage, $text, $matches_embed_pluginspage); + + $matches_embed = array_merge( + (array)$matches_embed_src[1], + (array)$matches_embed_pluginurl[1], + (array)$matches_embed_pluginspage[1] + ); + } + + // Finds iframe tags with links in the content. + $matches_iframe = array(1 => NULL); + if (variable_get('linkchecker_extract_from_iframe', 0) == 1) { + $pattern_iframe = '/]*src=["\']([^"\']*)["\'][^>]*>/i'; + preg_match_all($pattern_iframe, $text, $matches_iframe); + } + + // Finds img tags with links in the content. + $matches_img = array(1 => NULL); + if (variable_get('linkchecker_extract_from_img', 0) == 1) { + $pattern_img = '/]*src=["\']([^"\']*)["\'][^>]*>/i'; + preg_match_all($pattern_img, $text, $matches_img); + } + + // Finds object/param tags with links in the content. + $matches_object = array(); + if (variable_get('linkchecker_extract_from_object', 0) == 1) { + // TODO's: + // * Allow flipped order of attributes in "param". + // * Try to extract links in unkown "flashvars" values (for e.g. file=http://, data=http://). 
+ $pattern_object_data = '/]*data=["\']([^"\']*)["\'][^>]*>/i'; + $pattern_object_codebase = '/]*codebase=["\']([^"\']*)["\'][^>]*>/i'; + $pattern_param = '/]*((name|src)=["\'](archive|filename|href|movie|src|url)["\']\s[^>]*)+value=["\']([^"\']*)["\'][^>]*>/i'; + + preg_match_all($pattern_object_data, $text, $matches_object_data); + preg_match_all($pattern_object_codebase, $text, $matches_object_codebase); + preg_match_all($pattern_param, $text, $matches_param); + + $matches_object = array_merge( + (array)$matches_object_data[1], + (array)$matches_object_codebase[1], + (array)$matches_param[4] + ); + } + + // Finds source tags with links in the content. + $matches_source = array(1 => NULL); + if (variable_get('linkchecker_extract_from_source', 0) == 1) { + $pattern_source = '/]*src=["\']([^"\']*)["\'][^>]*>/i'; + preg_match_all($pattern_source, $text, $matches_source); + } + + // Finds video tags with links in the content. + $matches_video = array(); + if (variable_get('linkchecker_extract_from_video', 0) == 1) { + $pattern_video_poster = '/]*poster=["\']([^"\']*)["\'][^>]*>/i'; + $pattern_video_src = '/]*src=["\']([^"\']*)["\'][^>]*>/i'; + + preg_match_all($pattern_video_poster, $text, $matches_video_poster); + preg_match_all($pattern_video_src, $text, $matches_video_src); + + $matches_video = array_merge( + (array)$matches_video_poster[1], + (array)$matches_video_src[1] + ); + } + + // Merge all extracted links into one array. + $urls = array_merge( + (array)$matches_a[1], + (array)$matches_audio[1], + (array)$matches_embed, + (array)$matches_iframe[1], + (array)$matches_img[1], + (array)$matches_object, + (array)$matches_source[1], + (array)$matches_video + ); + + // Remove empty values. + $urls = array_filter($urls); + // Remove duplicate urls. + $urls = array_unique($urls); + + // What type of links schould be checked? 
+ $linkchecker_check_links_types = variable_get('linkchecker_fqdn_only', 1); + + $links = array(); + foreach ($urls as $url) { + // Decode HTML links into plain text links. + $url_decoded = decode_entities($url); + + // Prefix protocol relative urls with a protocol to allow link checking. + if (preg_match('!^//!', $url_decoded)) { + $http_protocol = (isset($_SERVER['HTTPS']) && $_SERVER['HTTPS'] == 'on') ? 'https' : 'http'; + $url_decoded = $http_protocol . ':' . $url_decoded; + } + + // FIXME: #1149596 HACK - Encode spaces in URLs, so validation equals TRUE and link gets added. + $url_encoded = str_replace(' ', '%20', $url_decoded); + + // Full qualified URLs. + if ($linkchecker_check_links_types != 2 && valid_url($url_encoded, TRUE)) { + // Add to Array and change HTML links into plain text links. + $links[$url_decoded][] = $url; + } + // Skip mailto:, javascript:, etc. + elseif (preg_match('/^\w[\w.+]*:/', $url_decoded)) { + continue; + } + // Local URLs. + elseif ($linkchecker_check_links_types != 1 && valid_url($url_encoded, FALSE)) { + // Get full qualified url with base path of content. + $absolute_content_path = _linkchecker_absolute_content_path($content_path); + + // Absolute local URLs need to start with [/]. + if (preg_match('!^/!', $url_decoded)) { + // Add to Array and change HTML encoded links into plain text links. + $links[$base_root . $url_decoded][] = $url; + } + // Anchors and URL parameters like "#foo" and "?foo=bar". + elseif (!empty($content_path) && preg_match('!^[?#]!', $url_decoded)) { + // Add to Array and change HTML encoded links into plain text links. + $links[$content_path . $url_decoded][] = $url; + } + // Relative URLs like "./foo/bar" and "../foo/bar". + elseif (!empty($absolute_content_path) && preg_match('!^\.{1,2}/!', $url_decoded)) { + // Build the URI without hostname before the URI is normalized and + // dot-segments will be removed. 
The hostname is added back after the + // normalization has completed to prevent hostname removal by the regex. + // This logic intentionally does not implement all the rules definied in + // RFC 3986, section 5.2.4 to show broken links and over-dot-segmented + // URIs; e.g. http://example.com/../../foo/bar. + // For more information, see http://drupal.org/node/832388. + $path = substr_replace($absolute_content_path . $url_decoded, '', 0, strlen($base_root)); + + // Remove './' segments where possible. + $path = str_replace('/./', '/', $path); + + // Remove '../' segments where possible. Loop until all segments are + // removed. Taken over from _drupal_build_css_path() in common.inc. + $last = ''; + while ($path != $last) { + $last = $path; + $path = preg_replace('`(^|/)(?!\.\./)([^/]+)/\.\./`', '$1', $path); + } + + // Glue the hostname and path to full-qualified URI. + $links[$base_root . $path][] = $url; + } + // Relative URLs like "test.png". + elseif (!empty($absolute_content_path) && preg_match('!^[^/]!', $url_decoded)) { + $links[$absolute_content_path . $url_decoded][] = $url; + } + else { + // TODO: Are there more special cases the module need to handle? + } + } + } + + return $links; +}

/**
 * Replaces old link with new link in text.
 *
 * If the whole text is the link itself (e.g. the value of a link field) a
 * plain string replacement is done. Otherwise only URL attributes of the HTML
 * tags enabled in the linkchecker_extract_from_* settings are rewritten.
 *
 * @param string $text
 *   The text a link is inside. Passed in as a reference and modified in place.
 * @param string $old_link_fqdn
 *   The old link to search for in strings.
 * @param string $new_link_fqdn
 *   The old link should be overwritten with this new link.
 */
function _linkchecker_link_replace(&$text, $old_link_fqdn = '', $new_link_fqdn = '') {
  // Don't do any string replacement if one of the values is empty.
  if (empty($text) || empty($old_link_fqdn) || empty($new_link_fqdn)) {
    return;
  }

  // Remove protocols and hostname from local URLs.
  $base_roots = array(
    drupal_strtolower('http://' . $_SERVER['HTTP_HOST']),
    drupal_strtolower('https://' . $_SERVER['HTTP_HOST']),
  );
  $old_link = str_replace($base_roots, '', $old_link_fqdn);
  $new_link = str_replace($base_roots, '', $new_link_fqdn);

  // Build variables with all URLs and run check_url() only once.
  $old_html_link_fqdn = check_url($old_link_fqdn);
  $new_html_link_fqdn = check_url($new_link_fqdn);
  $old_html_link = check_url($old_link);
  $new_html_link = check_url($new_link);

  // Replace links in CCK link and text and Links weblink fields.
  if (in_array($text, array($old_html_link_fqdn, $old_html_link, $old_link_fqdn, $old_link))) {
    // Keep old and new links in the same encoding and format and short or
    // fully qualified.
    $text = str_replace($old_html_link_fqdn, $new_html_link_fqdn, $text);
    $text = str_replace($old_html_link, $new_html_link, $text);
    $text = str_replace($old_link_fqdn, $new_link_fqdn, $text);
    $text = str_replace($old_link, $new_link, $text);
    return;
  }

  // Create an array of preg quoted links with HTML decoded and encoded URLs.
  // Empty variants are skipped: an empty alternative would match every empty
  // attribute value.
  $old_links_quoted = array();
  foreach (array($old_html_link_fqdn, $old_html_link, $old_link) as $old_variant) {
    if ($old_variant !== '') {
      $old_links_quoted[] = preg_quote($old_variant, '/');
    }
  }
  if (empty($old_links_quoted)) {
    return;
  }

  // Remove duplicate URLs from array if URLs do not have URL parameters.
  // If more than one URL parameter exists - one URL in the array will have
  // an unencoded ampersand "&" and a second URL will have an HTML encoded
  // ampersand "&amp;".
  $regex_old_links = implode('|', array_unique($old_links_quoted));

  // The new URL is inserted into preg_replace() replacement strings. Escape
  // "\" and "$" so parts of the URL are never interpreted as backreferences,
  // and use the ${n} form below so a URL starting with a digit cannot extend
  // the group number.
  $new_link_replacement = addcslashes($new_html_link, '\\$');
  $attribute_end = ')(["\'][^>]*>)/i';

  // Create array to fill with replacement rules.
  $replacements = array();

  // Add replace rules for a/area tags.
  if (variable_get('linkchecker_extract_from_a', 1) == 1) {
    // TODO: If link text between opening an closing a-tag having the same
    // URL, also replace the link text. Create a replace regex for this task.
    $text = str_replace(array('>' . $old_html_link_fqdn . '</a>', '>' . $old_html_link . '</a>', '>' . $old_link . '</a>'), '>' . $new_html_link . '</a>', $text);
    $replacements['/(<(a|area)\s[^>]*href=["\'])(' . $regex_old_links . $attribute_end] = '${1}' . $new_link_replacement . '${4}';
  }

  // Add replace rules for audio tags.
  if (variable_get('linkchecker_extract_from_audio', 0) == 1) {
    $replacements['/(<audio\s[^>]*src=["\'])(' . $regex_old_links . $attribute_end] = '${1}' . $new_link_replacement . '${3}';
  }

  // Add replace rules for embed tags.
  if (variable_get('linkchecker_extract_from_embed', 0) == 1) {
    $replacements['/(<embed\s[^>]*src=["\'])(' . $regex_old_links . $attribute_end] = '${1}' . $new_link_replacement . '${3}';
    $replacements['/(<embed\s[^>]*pluginurl=["\'])(' . $regex_old_links . $attribute_end] = '${1}' . $new_link_replacement . '${3}';
    $replacements['/(<embed\s[^>]*pluginspage=["\'])(' . $regex_old_links . $attribute_end] = '${1}' . $new_link_replacement . '${3}';
  }

  // Add replace rules for iframe tags.
  if (variable_get('linkchecker_extract_from_iframe', 0) == 1) {
    $replacements['/(<iframe\s[^>]*src=["\'])(' . $regex_old_links . $attribute_end] = '${1}' . $new_link_replacement . '${3}';
  }

  // Add replace rules for img tags.
  if (variable_get('linkchecker_extract_from_img', 0) == 1) {
    $replacements['/(<img\s[^>]*src=["\'])(' . $regex_old_links . $attribute_end] = '${1}' . $new_link_replacement . '${3}';
  }

  // Add replace rules for object/param tags.
  if (variable_get('linkchecker_extract_from_object', 0) == 1) {
    $replacements['/(<object\s[^>]*data=["\'])(' . $regex_old_links . $attribute_end] = '${1}' . $new_link_replacement . '${3}';
    $replacements['/(<object\s[^>]*codebase=["\'])(' . $regex_old_links . $attribute_end] = '${1}' . $new_link_replacement . '${3}';
    $replacements['/(<param\s[^>]*((name|src)=["\'](archive|filename|href|movie|src|url)["\']\s[^>]*)+value=["\'])(' . $regex_old_links . $attribute_end] = '${1}' . $new_link_replacement . '${6}';
  }

  // Add replace rules for source tags.
  if (variable_get('linkchecker_extract_from_source', 0) == 1) {
    $replacements['/(<source\s[^>]*src=["\'])(' . $regex_old_links . $attribute_end] = '${1}' . $new_link_replacement . '${3}';
  }

  // Add replace rules for video tags.
  if (variable_get('linkchecker_extract_from_video', 0) == 1) {
    $replacements['/(<video\s[^>]*poster=["\'])(' . $regex_old_links . $attribute_end] = '${1}' . $new_link_replacement . '${3}';
    $replacements['/(<video\s[^>]*src=["\'])(' . $regex_old_links . $attribute_end] = '${1}' . $new_link_replacement . '${3}';
  }

  // Replace link by applying all replacement rules on text.
  foreach ($replacements as $pattern => $replacement) {
    $replaced = preg_replace($pattern, $replacement, $text);
    // preg_replace() returns NULL on error; never let that wipe the content.
    if ($replaced !== NULL) {
      $text = $replaced;
    }
  }
}

/**
 * Customized clone of core check_markup() function with additional filter blacklist.
 *
 * See http://api.drupal.org/api/function/check_markup for API documentation.
 *
 * @param string $text
 *   The text to be filtered.
 * @param int $format
 *   The input format to run the text through.
 * @param bool $check
 *   Whether to check filter_access() for $format before filtering.
 *
 * @return string
 *   The filtered text, or the translated string 'n/a' if $text is not set or
 *   the access check fails.
 */
function _linkchecker_check_markup($text, $format = FILTER_FORMAT_DEFAULT, $check = TRUE) {
  // When $check = TRUE, do an access check on $format.
  if (isset($text) && (!$check || filter_access($format))) {
    $format = filter_resolve_format($format);

    // Check for a cached version of this piece of text.
    $cache_id = 'linkchecker:' . $format . ':' . md5($text);
    if ($cached = cache_get($cache_id, 'cache_filter')) {
      return $cached->data;
    }

    // See if caching is allowed for this format.
    $cache = filter_format_allowcache($format);

    // Convert all Windows and Mac newlines to a single newline,
    // so filters only need to deal with one possibility.
    $text = str_replace(array("\r\n", "\r"), "\n", $text);

    // Get a complete list of filters, ordered properly.
    $filters = filter_list_format($format);

    // Do not run placeholder or special tag filters used as references
    // to nodes like 'weblink' or 'weblinks' node types. If the original
    // link node is updated, all links are automatically up-to-date and
    // there is no need to notify about the broken link on all nodes having
    // a link reference in content. This would only confuse the authors as
    // they may also not be able to fix the source node of the reference.
    $filters_blacklist = array_keys(array_filter(variable_get('linkchecker_filter_blacklist', explode('|', LINKCHECKER_DEFAULT_FILTER_BLACKLIST))));

    // Give filters the chance to escape HTML-like data such as code or formulas.
    foreach ($filters as $filter) {
      if (!in_array($filter->module . '/' . $filter->delta, $filters_blacklist)) {
        $text = module_invoke($filter->module, 'filter', 'prepare', $filter->delta, $format, $text, $cache_id);
      }
    }

    // Perform filtering.
    foreach ($filters as $filter) {
      if (!in_array($filter->module . '/' . $filter->delta, $filters_blacklist)) {
        $text = module_invoke($filter->module, 'filter', 'process', $filter->delta, $format, $text, $cache_id);
      }
    }

    // Store in cache with a minimum expiration time of 1 day.
    if ($cache) {
      cache_set($cache_id, $text, 'cache_filter', time() + (60 * 60 * 24));
    }
  }
  else {
    $text = t('n/a');
  }

  return $text;
}

/**
 * Get the path of an URL.
 *
 * @param string $url
 *   The http/https URL to parse.
 * @return string
 *   Full qualified URL with absolute path of the URL (everything up to and
 *   including the last slash), or NULL if the URL cannot be parsed, has no
 *   host, or does not use the http/https scheme.
 */
function _linkchecker_absolute_content_path($url) {

  // Parse the URL and make sure we can handle the schema.
  $uri = @parse_url($url);

  // parse_url() returns FALSE on seriously malformed URLs. Without a scheme
  // and a host no absolute base path can be built.
  if (empty($uri) || !isset($uri['scheme']) || !isset($uri['host'])) {
    return NULL;
  }

  // Break if the schema is not supported.
  if (!in_array($uri['scheme'], array('http', 'https'))) {
    return NULL;
  }

  $scheme = $uri['scheme'] . '://';

  // The password is optional; only read it when it is present.
  $user = '';
  if (isset($uri['user'])) {
    $user = $uri['user'] . (!empty($uri['pass']) ? ':' . $uri['pass'] : '') . '@';
  }

  // Port 80 is only the implicit default for http. For https an explicit
  // port must always be kept, otherwise the URL would point somewhere else.
  $host = $uri['host'];
  if (isset($uri['port']) && !($uri['scheme'] == 'http' && $uri['port'] == 80)) {
    $host .= ':' . $uri['port'];
  }

  $path = isset($uri['path']) ? $uri['path'] : '/';

  // Glue the URL variables.
  $absolute_url = $scheme . $user . $host . $path;

  // Find the last slash and remove all after the last slash to get the path.
  // strrpos() reports a byte offset, so the byte based substr() must be used;
  // a character based substring would cut at the wrong position as soon as
  // the URL contains multibyte characters.
  $last_slash = strrpos($absolute_url, '/');
  $absolute_content_url = substr($absolute_url, 0, $last_slash + 1);

  return $absolute_content_url;
}

/**
 * Verifies against the blacklists, if the link status should be checked or not.
 *
 * @param string $url
 *   The URL to verify.
 *
 * @return bool
 *   FALSE if the URL matches an entry of the
 *   'linkchecker_disable_link_check_for_urls' setting or does not start with
 *   http:// or https://, otherwise TRUE.
 */
function _linkchecker_link_check_status_filter($url) {
  $status = TRUE;

  // Domain blacklist check
  $urls = variable_get('linkchecker_disable_link_check_for_urls', LINKCHECKER_RESERVED_DOCUMENTATION_DOMAINS);
  if (!empty($urls)) {
    // One entry per line. Blank lines must be dropped: an empty regex
    // alternative matches every URL and would disable all link checks.
    $quoted_entries = array();
    foreach (preg_split('/(\r\n?|\n)/', $urls) as $entry) {
      $entry = trim($entry);
      if ($entry !== '') {
        $quoted_entries[] = preg_quote($entry, '/');
      }
    }

    if (!empty($quoted_entries) && preg_match('/' . implode('|', $quoted_entries) . '/', $url)) {
      $status = FALSE;
    }
  }

  // Protocol whitelist check (without curl, only http/https is supported)
  if (!preg_match('/^(https?):\/\//i', $url)) {
    $status = FALSE;
  }

  return $status;
}

/**
 * Defines the list of allowed response codes for form input validation.
 *
 * @param int $code
 *   A numeric response code.
 * @return
 *   TRUE if the status code is valid.
+ */ +function _linkchecker_isvalid_response_code($code) { + + $responses = array( + 100 => 'Continue', + 101 => 'Switching Protocols', + 200 => 'OK', + 201 => 'Created', + 202 => 'Accepted', + 203 => 'Non-Authoritative Information', + 204 => 'No Content', + 205 => 'Reset Content', + 206 => 'Partial Content', + 300 => 'Multiple Choices', + 301 => 'Moved Permanently', + 302 => 'Found', + 303 => 'See Other', + 304 => 'Not Modified', + 305 => 'Use Proxy', + 307 => 'Temporary Redirect', + 400 => 'Bad Request', + 401 => 'Unauthorized', + 402 => 'Payment Required', + 403 => 'Forbidden', + 404 => 'Not Found', + 405 => 'Method Not Allowed', + 406 => 'Not Acceptable', + 407 => 'Proxy Authentication Required', + 408 => 'Request Time-out', + 409 => 'Conflict', + 410 => 'Gone', + 411 => 'Length Required', + 412 => 'Precondition Failed', + 413 => 'Request Entity Too Large', + 414 => 'Request-URI Too Large', + 415 => 'Unsupported Media Type', + 416 => 'Requested range not satisfiable', + 417 => 'Expectation Failed', + 500 => 'Internal Server Error', + 501 => 'Not Implemented', + 502 => 'Bad Gateway', + 503 => 'Service Unavailable', + 504 => 'Gateway Time-out', + 505 => 'HTTP Version not supported', + ); + + return array_key_exists($code, $responses); +} + +/** + * Should the defined node type scanned for links? + * + * @param string $node_type + * Verifies if the node type is enabled for link checks and should be scanned. + * @return + * TRUE if node type should be scanned, otherwise FALSE. + */ +function _linkchecker_scan_nodetype($node_type = NULL) { + + $enabled = FALSE; + $node_types = array_keys(array_filter(variable_get('linkchecker_scan_nodetypes', array()))); + + // Scan specific node types only. + if (in_array($node_type, $node_types)) { + $enabled = TRUE; + } + + return $enabled; +} + +/** + * Unpublishes all nodes having the specified link id. + * + * @param int $lid + * A link ID that have reached a defined failcount. 
+ */ +function _linkchecker_unpublish_nodes($lid) { + $res = db_query("SELECT * FROM {linkchecker_nodes} WHERE lid = %d", $lid); + while ($row = db_fetch_object($res)) { + $node = node_load(array('nid' => $row->nid)); + $node->status = 0; + node_save($node); + watchdog('linkchecker', 'Set @type %title to unpublished.', array('@type' => $node->type, '%title' => $node->title)); + } +} + +/** + * Load comment as array. + */ +function _linkchecker_comment_load($cid) { + return db_fetch_array(db_query('SELECT * FROM {comments} WHERE cid = %d', $cid)); +} + +/** + * Load link as array. + */ +function linkchecker_link_load($lid) { + return db_fetch_array(db_query("SELECT * FROM {linkchecker_links} WHERE lid = %d", $lid)); +} + +/** + * Impersonates another user. http://drupal.org/node/287292#comment-3162350 + * + * Each time this function is called, the active user is saved and $new_user + * becomes the active user. Multiple calls to this function can be nested, + * and session saving will be disabled until all impersonation attempts have + * been reverted using linkchecker_revert_user(). + * + * @param $new_user + * User to impersonate, either a UID or a user object. + * + * @return + * Current user object. + * + * @see linkchecker_revert_user() + */ +function linkchecker_impersonate_user($new_user = NULL) { + global $user; + static $user_original; + + if (!isset($new_user)) { + if (isset($user_original) && !empty($user_original)) { + // Restore the previous user from the stack. + $user = array_pop($user_original); + + // Re-enable session saving if we are no longer impersonating a user. + if (empty($user_original)) { + session_save_session(TRUE); + } + } + } + else { + // Push the original user onto the stack and prevent session saving. + $user_original[] = $user; + session_save_session(FALSE); + + if (is_numeric($new_user)) { + $user = user_load($new_user); + } + else { + $user = is_object($new_user) ? 
$new_user : (object) $new_user; + } + } + + return $user; +} + +/** + * Reverts to the previous user after impersonating. + * + * @return + * Current user. + * + * @see linkchecker_impersonate_user() + */ +function linkchecker_revert_user() { + return linkchecker_impersonate_user(); +} + +/** + * Backport of drupal_set_time_limit from Drupal 7. + */ +function linkchecker_set_time_limit($time_limit) { + if (function_exists('set_time_limit')) { + @set_time_limit($time_limit); + } +} diff --git a/sites/all/modules/linkchecker/tests/linkchecker_extract_links.test b/sites/all/modules/linkchecker/tests/linkchecker_extract_links.test new file mode 100644 index 0000000..e4f97f7 --- /dev/null +++ b/sites/all/modules/linkchecker/tests/linkchecker_extract_links.test @@ -0,0 +1,441 @@ + t('Link extraction tests'), + 'description' => t('Test Link checker module link extraction functionality.'), + 'group' => 'Link checker', + ); + } + + public function setUp() { + parent::setUp('linkchecker', 'path'); + + $permissions = array( + 'create page content', + 'edit own page content', + 'administer url aliases', + 'create url aliases', + ); + + $user = $this->drupalCreateUser($permissions); + $this->drupalLogin($user); + + } + + public function testLinkCheckerCreateNodeWithLinks() { + + // Enable all node type page for link extraction. + variable_set('linkchecker_scan_nodetypes', array('page')); + variable_set('linkchecker_scan_blocks', 1); + + // Core enables the URL filter for "Full HTML" by default. + // -> Blacklist / Disable URL filter for testing. + variable_set('linkchecker_filter_blacklist', array('filter/2')); + + // Extract from all link checker supported HTML tags. 
+ variable_set('linkchecker_extract_from_a', 1); + variable_set('linkchecker_extract_from_audio', 1); + variable_set('linkchecker_extract_from_embed', 1); + variable_set('linkchecker_extract_from_iframe', 1); + variable_set('linkchecker_extract_from_img', 1); + variable_set('linkchecker_extract_from_object', 1); + variable_set('linkchecker_extract_from_source', 1); + variable_set('linkchecker_extract_from_video', 1); + + $body = << + +Send email +Execute JavaScript + + + + + + +Test image 1 +Test image 2 + +../foo1/bar1 +./foo2/bar2 +../foo3/../foo4/foo5 +./foo4/../foo5/foo6 +./foo4/./foo5/foo6 + + +URL with uncommon chars + + + + +
+ No weather report visible? At MSN + you are able to find the weather report missing here and the + Flash plugin can be found at Adobe. +
+ + + + + + + + + + + + + + + + + + + + + + /video/foo3.mp4 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +EOT; + + // Save folder names in variables for reuse. + $folder1 = $this->randomName(10); + $folder2 = $this->randomName(5); + + // Allow the user to use the 'Full HTML' format. + db_query("UPDATE {filter_formats} SET roles = '%s' WHERE format = %d", ',2,', 2); + + // Fill node array. + $edit = array(); + $edit['title'] = $this->randomName(32); + $edit['body'] = $body; + $edit['path'] = $folder1 . '/' . $folder2; + $edit['format'] = 2; + + // Extract only full qualified URLs. + variable_set('linkchecker_fqdn_only', 1); + + // Verify path input field appears on add "Basic page" form. + $this->drupalGet('node/add/page'); + // Verify path input is present. + $this->assertFieldByName('path', '', t('Path input field present on add Basic page form.')); + + // Save node. + $this->drupalPost('node/add/page', $edit, t('Save')); + $this->assertRaw(t('@type %title has been created.', array('@type' => 'Page', '%title' => $edit['title'])), t('Node was created.')); + + // Verify if the content links are extracted correctly. 
+ $urls_fqdn = array( + 'http://www.lagrandeepicerie.fr/#e-boutique/Les_produits_du_moment,2/coffret_vins_doux_naturels,149', + 'http://wetterservice.msn.de/phclip.swf?zip=60329&ort=Frankfurt', + 'http://www.msn.de/', + 'http://www.adobe.com/', + 'http://www.apple.com/qtactivex/qtplugin.cab', + 'http://example.net/video/foo1.mov', + 'http://example.net/video/foo2.mov', + 'http://example.net/video/foo3.mov', + 'http://example.org/video/foo1.mp4', + 'http://example.org/video/foo2.mp4', + 'http://example.org/video/foo3.mp4', + 'http://example.org/video/foo4.mp4', + 'http://example.org/video/foo5.mp4', + 'http://example.org/video/foo6.mp4', + 'http://example.org/video/player1.swf', + 'http://example.org/video/player2.swf', + 'http://example.org/video/player3.swf', + 'http://example.com/iframe/', + 'http://www.theora.org/cortado.jar', + 'http://v2v.cc/~j/theora_testsuite/pixel_aspect_ratio.ogg', + 'http://v2v.cc/~j/theora_testsuite/pixel_aspect_ratio.mov', + 'http://v2v.cc/~j/theora_testsuite/320x240.ogg', + ); + + foreach ($urls_fqdn as $org_url => $check_url) { + $link = $this->getLinkCheckerLink($check_url); + if ($link) { + $this->assertIdentical($link->url, $check_url, t('Absolute URL %org_url matches expected result %check_url.', array('%org_url' => $org_url, '%check_url' => $check_url))); + } + else { + $this->fail(t('URL %check_url not found.', array('%check_url' => $check_url))); + } + } + + // Check if the number of links is correct. + // - Verifies if all HTML tag regexes matched. + // - Verifies that the linkchecker filter blacklist works well. + $urls_in_database = $this->getLinkCheckerLinksCount(); + $urls_expected_count = count($urls_fqdn); + $this->assertEqual($urls_in_database, $urls_expected_count, t('Found @urls_in_database URLs in database matches expected result of @urls_expected_count.', array('@urls_in_database' => $urls_in_database, '@urls_expected_count' => $urls_expected_count))); + + // Extract all URLs including relative path. 
+ variable_set('clean_url', 1); + variable_set('linkchecker_fqdn_only', 0); + + $node = $this->drupalGetNodeByTitle($edit['title']); + $this->assertTrue($node, t('Node found in database.')); + $this->drupalPost('node/' . $node->nid . '/edit', $edit, t('Save')); + $this->assertRaw(t('@type %title has been updated.', array('@type' => 'Page', '%title' => $edit['title']))); + + // FIXME: Path alias seems not saved!??? + // $this->assertIdentical($node->path, $edit['path'], t('URL alias "@node-path" matches path "@edit-path".', array('@node-path' => $node->path, '@edit-path' => $edit['path']))); + + // DEBUG + $linkchecker_links = db_query("SELECT * FROM {linkchecker_links}"); + while ($row = db_fetch_array($linkchecker_links)) { + $rows[] = $row['url']; + } + $this->verbose(theme_item_list($rows, 'URLs in database:')); + // $this->fail('DEBUG: ' .implode('
', $rows)); + + // Verify if the content links are extracted correctly. + global $base_root, $base_path; + $urls_relative = array( + '../foo1/test.png' => $base_root . $base_path . 'foo1/test.png', + 'test.png' => $base_root . $base_path . $folder1 . '/test.png', + '../foo1/bar1' => $base_root . $base_path . 'foo1/bar1', + './foo2/bar2' => $base_root . $base_path . $folder1 . '/foo2/bar2', + '../foo3/../foo4/foo5' => $base_root . $base_path . 'foo4/foo5', + './foo4/../foo5/foo6' => $base_root . $base_path . $folder1 . '/foo5/foo6', + './foo4/./foo5/foo6' => $base_root . $base_path . $folder1 . '/foo4/foo5/foo6', + 'flash.png' => $base_root . $base_path . $folder1 . '/flash.png', + 'ritmo.mid' => $base_root . $base_path . $folder1 . '/ritmo.mid', + 'my_ogg_video.ogg' => $base_root . $base_path . $folder1 . '/my_ogg_video.ogg', + 'video.ogv' => $base_root . $base_path . $folder1 . '/video.ogv', + 'flvplayer1.swf' => $base_root . $base_path . $folder1 . '/flvplayer1.swf', + 'flvplayer2.swf' => $base_root . $base_path . $folder1 . '/flvplayer2.swf', + 'foo.ogg' => $base_root . $base_path . $folder1 . '/foo.ogg', + ); + $this->verbose(theme_item_list($urls_relative, 'Verify if following relative URLs exists:')); + + foreach ($urls_relative as $org_url => $check_url) { + $link = $this->getLinkCheckerLink($check_url); + if ($link) { + $this->assertIdentical($link->url, $check_url, t('Relative URL %org_url matches expected result %check_url.', array('%org_url' => $org_url, '%check_url' => $check_url))); + } + else { + $this->fail(t('URL %check_url not found.', array('%check_url' => $check_url))); + } + } + + // Check if the number of links is correct. 
+ $urls_in_database = $this->getLinkCheckerLinksCount(); + $urls_expected_count = count($urls_fqdn + $urls_relative); + $this->assertEqual($urls_in_database, $urls_expected_count, t('Found @urls_in_database URLs in database matches expected result of @urls_expected_count.', array('@urls_in_database' => $urls_in_database, '@urls_expected_count' => $urls_expected_count))); + + + // Verify if link check has been enabled for normal URLs. + $urls = array( + 'http://www.lagrandeepicerie.fr/#e-boutique/Les_produits_du_moment,2/coffret_vins_doux_naturels,149', + 'http://wetterservice.msn.de/phclip.swf?zip=60329&ort=Frankfurt', + 'http://www.msn.de/', + 'http://www.adobe.com/', + 'http://www.apple.com/qtactivex/qtplugin.cab', + 'http://www.theora.org/cortado.jar', + 'http://v2v.cc/~j/theora_testsuite/pixel_aspect_ratio.ogg', + 'http://v2v.cc/~j/theora_testsuite/pixel_aspect_ratio.mov', + 'http://v2v.cc/~j/theora_testsuite/320x240.ogg', + $base_root . $base_path . 'foo1/test.png', + $base_root . $base_path . $folder1 . '/test.png', + $base_root . $base_path . 'foo1/bar1', + $base_root . $base_path . $folder1 . '/foo2/bar2', + $base_root . $base_path . 'foo4/foo5', + $base_root . $base_path . $folder1 . '/foo5/foo6', + $base_root . $base_path . $folder1 . '/foo4/foo5/foo6', + $base_root . $base_path . $folder1 . '/flash.png', + $base_root . $base_path . $folder1 . '/ritmo.mid', + $base_root . $base_path . $folder1 . '/my_ogg_video.ogg', + $base_root . $base_path . $folder1 . '/video.ogv', + $base_root . $base_path . $folder1 . '/flvplayer1.swf', + $base_root . $base_path . $folder1 . '/flvplayer2.swf', + $base_root . $base_path . $folder1 . '/foo.ogg', + ); + + foreach ($urls as $url) { + $this->assertTrue($this->getLinkcheckerLink($url)->status, t('Link check for %url is enabled.', array('%url' => $url))); + } + + + // Verify if link check has been disabled for example.com/net/org URLs. 
+ $documentation_urls = array( + 'http://example.net/video/foo1.mov', + 'http://example.net/video/foo2.mov', + 'http://example.net/video/foo3.mov', + 'http://example.org/video/foo1.mp4', + 'http://example.org/video/foo2.mp4', + 'http://example.org/video/foo3.mp4', + 'http://example.org/video/foo4.mp4', + 'http://example.org/video/foo5.mp4', + 'http://example.org/video/foo6.mp4', + 'http://example.org/video/player1.swf', + 'http://example.org/video/player2.swf', + 'http://example.org/video/player3.swf', + 'http://example.com/iframe/', + ); + + foreach ($documentation_urls as $documentation_url) { + $this->assertFalse($this->getLinkcheckerLink($documentation_url)->status, t('Link check for %url is disabled.', array('%url' => $documentation_url))); + } + + } + + /** + * Get linkchecker link by url. + * + * @param string $url + * URL of the link to find. + * @return object + * Link object. + */ + function getLinkCheckerLink($url) { + return db_fetch_object(db_query("SELECT * FROM {linkchecker_links} WHERE urlhash = '%s'", md5($url))); + } + + /** + * Get the current number of links in linkchecker_links table. + */ + function getLinkCheckerLinksCount() { + return db_result(db_query("SELECT COUNT(1) FROM {linkchecker_links}")); + } +} + +/** + * Test case for impersonating users. + * + * Based on http://drupal.org/node/287292#comment-3162350 + */ +class LinkCheckerUserImpersonatingUserTestCase extends DrupalWebTestCase { + + public static function getInfo() { + return array( + 'name' => 'Link checker impersonate users', + 'description' => 'Temporarily impersonate another user, and then restore the original user.', + 'group' => 'Link checker', + ); + } + + function setUp() { + parent::setUp('linkchecker'); + } + + function testLinkCheckerImpersonateUser() { + global $user; + $original_user = $user; + + // If not currently logged in, use linkchecker_impersonate_user() to switch to + // user 1. If logged in, switch to the anonymous user instead. 
+ if (user_is_anonymous()) { + linkchecker_impersonate_user(1); + } + else { + linkchecker_impersonate_user(0); + } + + // Verify that the active user has changed, and that session saving is + // disabled. + $this->assertEqual($user->uid, ($original_user->uid == 0 ? 1 : 0), t('User switched')); + $this->assertFalse(session_save_session(), t('Session saving is disabled.')); + + // Perform a second (nested) impersonation. + linkchecker_impersonate_user(1); + $this->assertEqual($user->uid, 1, t('User switched.')); + + // Revert to the user which was active between the first and second + // impersonation attempt. + linkchecker_revert_user(); + + // Since we are still impersonating the user from the first attempt, + // session handling still needs to be disabled. + $this->assertEqual($user->uid, ($original_user->uid == 0 ? 1 : 0), t('User switched.')); + $this->assertFalse(session_save_session(), t('Session saving is disabled.')); + + // Revert to the original user which was active before the first + // impersonation attempt. + linkchecker_revert_user(); + + // Assert that the original user is the active user again, and that session + // saving has been re-enabled. + $this->assertEqual($user->uid, $original_user->uid, t('Original user successfully restored.')); + + // Simpletest uses linkchecker_impersonate_user() too, revert the impersonation by + // Simpletest to enable session saving again. This is safe because calling + // linkchecker_revert_user() too often simply results in returning the active user. + linkchecker_revert_user(); + $this->assertTrue(session_save_session(), t('Session saving is enabled.')); + } +}